summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/util
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/util
parentInitial commit. (diff)
downloadceph-upstream.tar.xz
ceph-upstream.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/util')
-rw-r--r--src/rocksdb/util/aligned_buffer.h248
-rw-r--r--src/rocksdb/util/autovector.h361
-rw-r--r--src/rocksdb/util/autovector_test.cc330
-rw-r--r--src/rocksdb/util/bloom_impl.h483
-rw-r--r--src/rocksdb/util/bloom_test.cc912
-rw-r--r--src/rocksdb/util/build_version.cc.in5
-rw-r--r--src/rocksdb/util/build_version.h15
-rw-r--r--src/rocksdb/util/cast_util.h21
-rw-r--r--src/rocksdb/util/channel.h67
-rw-r--r--src/rocksdb/util/coding.cc89
-rw-r--r--src/rocksdb/util/coding.h480
-rw-r--r--src/rocksdb/util/coding_test.cc217
-rw-r--r--src/rocksdb/util/compaction_job_stats_impl.cc91
-rw-r--r--src/rocksdb/util/comparator.cc216
-rw-r--r--src/rocksdb/util/compression.h1407
-rw-r--r--src/rocksdb/util/compression_context_cache.cc108
-rw-r--r--src/rocksdb/util/compression_context_cache.h47
-rw-r--r--src/rocksdb/util/concurrent_task_limiter_impl.cc67
-rw-r--r--src/rocksdb/util/concurrent_task_limiter_impl.h67
-rw-r--r--src/rocksdb/util/core_local.h83
-rw-r--r--src/rocksdb/util/crc32c.cc1263
-rw-r--r--src/rocksdb/util/crc32c.h51
-rw-r--r--src/rocksdb/util/crc32c_arm64.cc165
-rw-r--r--src/rocksdb/util/crc32c_arm64.h48
-rw-r--r--src/rocksdb/util/crc32c_ppc.c94
-rw-r--r--src/rocksdb/util/crc32c_ppc.h19
-rw-r--r--src/rocksdb/util/crc32c_ppc_asm.S752
-rw-r--r--src/rocksdb/util/crc32c_ppc_constants.h900
-rw-r--r--src/rocksdb/util/crc32c_test.cc180
-rw-r--r--src/rocksdb/util/defer.h52
-rw-r--r--src/rocksdb/util/defer_test.cc39
-rw-r--r--src/rocksdb/util/duplicate_detector.h68
-rw-r--r--src/rocksdb/util/dynamic_bloom.cc70
-rw-r--r--src/rocksdb/util/dynamic_bloom.h214
-rw-r--r--src/rocksdb/util/dynamic_bloom_test.cc324
-rw-r--r--src/rocksdb/util/file_checksum_helper.cc85
-rw-r--r--src/rocksdb/util/file_checksum_helper.h117
-rw-r--r--src/rocksdb/util/file_reader_writer_test.cc444
-rw-r--r--src/rocksdb/util/filelock_test.cc141
-rw-r--r--src/rocksdb/util/filter_bench.cc751
-rw-r--r--src/rocksdb/util/gflags_compat.h19
-rw-r--r--src/rocksdb/util/hash.cc83
-rw-r--r--src/rocksdb/util/hash.h120
-rw-r--r--src/rocksdb/util/hash_map.h67
-rw-r--r--src/rocksdb/util/hash_test.cc377
-rw-r--r--src/rocksdb/util/heap.h166
-rw-r--r--src/rocksdb/util/heap_test.cc139
-rw-r--r--src/rocksdb/util/kv_map.h33
-rw-r--r--src/rocksdb/util/log_write_bench.cc86
-rw-r--r--src/rocksdb/util/murmurhash.cc191
-rw-r--r--src/rocksdb/util/murmurhash.h42
-rw-r--r--src/rocksdb/util/mutexlock.h135
-rw-r--r--src/rocksdb/util/ppc-opcode.h27
-rw-r--r--src/rocksdb/util/random.cc38
-rw-r--r--src/rocksdb/util/random.h166
-rw-r--r--src/rocksdb/util/random_test.cc105
-rw-r--r--src/rocksdb/util/rate_limiter.cc339
-rw-r--r--src/rocksdb/util/rate_limiter.h113
-rw-r--r--src/rocksdb/util/rate_limiter_test.cc235
-rw-r--r--src/rocksdb/util/repeatable_thread.h149
-rw-r--r--src/rocksdb/util/repeatable_thread_test.cc107
-rw-r--r--src/rocksdb/util/set_comparator.h22
-rw-r--r--src/rocksdb/util/slice.cc243
-rw-r--r--src/rocksdb/util/slice_test.cc163
-rw-r--r--src/rocksdb/util/slice_transform_test.cc153
-rw-r--r--src/rocksdb/util/status.cc143
-rw-r--r--src/rocksdb/util/stderr_logger.h31
-rw-r--r--src/rocksdb/util/stop_watch.h118
-rw-r--r--src/rocksdb/util/string_util.cc409
-rw-r--r--src/rocksdb/util/string_util.h138
-rw-r--r--src/rocksdb/util/thread_list_test.cc352
-rw-r--r--src/rocksdb/util/thread_local.cc554
-rw-r--r--src/rocksdb/util/thread_local.h101
-rw-r--r--src/rocksdb/util/thread_local_test.cc580
-rw-r--r--src/rocksdb/util/thread_operation.h121
-rw-r--r--src/rocksdb/util/threadpool_imp.cc507
-rw-r--r--src/rocksdb/util/threadpool_imp.h112
-rw-r--r--src/rocksdb/util/timer_queue.h230
-rw-r--r--src/rocksdb/util/timer_queue_test.cc72
-rw-r--r--src/rocksdb/util/user_comparator_wrapper.h65
-rw-r--r--src/rocksdb/util/util.h16
-rw-r--r--src/rocksdb/util/vector_iterator.h101
-rw-r--r--src/rocksdb/util/xxh3p.h1648
-rw-r--r--src/rocksdb/util/xxhash.cc1160
-rw-r--r--src/rocksdb/util/xxhash.h598
85 files changed, 21465 insertions, 0 deletions
diff --git a/src/rocksdb/util/aligned_buffer.h b/src/rocksdb/util/aligned_buffer.h
new file mode 100644
index 000000000..c7b0728c9
--- /dev/null
+++ b/src/rocksdb/util/aligned_buffer.h
@@ -0,0 +1,248 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <algorithm>
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains utilities to handle the alignment of pages and buffers.
+
+// Truncate s down to a multiple of page_size, i.e. to the page boundary at or
+// below s. The mask used below requires page_size to be a power of two.
+// Example:
+// TruncateToPageBoundary(4096, 5000) => 4096
+// TruncateToPageBoundary(4096, 10000) => 8192
+inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
+ s -= (s & (page_size - 1));
+ assert((s % page_size) == 0);
+ return s;
+}
+
+// Round up x to a multiple of y (y is used as a divisor, so it must be > 0).
+// Example:
+// Roundup(13, 5) => 15
+// Roundup(201, 16) => 208
+inline size_t Roundup(size_t x, size_t y) {
+ return ((x + y - 1) / y) * y;
+}
+
+// Round down x to a multiple of y (y is used as a divisor, so it must be > 0).
+// Example:
+// Rounddown(13, 5) => 10
+// Rounddown(201, 16) => 192
+inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; }
+
+// AlignedBuffer manages a buffer by taking alignment into consideration, and
+// aligns the buffer start and end positions. It is mainly used for direct I/O,
+// though it can be used for other purposes as well.
+// It also supports expanding the managed buffer, and copying all or part of
+// the data from the old buffer into the new expanded buffer. Such a copy
+// helps avoid an I/O to re-fetch the data from disk.
+//
+// Example:
+// AlignedBuffer buf;
+// buf.Alignment(alignment);
+// buf.AllocateNewBuffer(user_requested_buf_size);
+// ...
+// buf.AllocateNewBuffer(2*user_requested_buf_size, /*copy_data*/ true,
+// copy_offset, copy_len);
+class AlignedBuffer {
+ size_t alignment_;
+ std::unique_ptr<char[]> buf_;
+ size_t capacity_;
+ size_t cursize_;
+ char* bufstart_;
+
+public:
+ AlignedBuffer()
+ : alignment_(),
+ capacity_(0),
+ cursize_(0),
+ bufstart_(nullptr) {
+ }
+
+ AlignedBuffer(AlignedBuffer&& o) ROCKSDB_NOEXCEPT {
+ *this = std::move(o);
+ }
+
+ // Move assignment: takes over o's allocation and bookkeeping.
+ // The moved-from buffer keeps its alignment but is left empty (no storage,
+ // zero capacity and size) so it can no longer reach memory it does not own.
+ AlignedBuffer& operator=(AlignedBuffer&& o) ROCKSDB_NOEXCEPT {
+ if (this == &o) {
+ // Self-move: nothing to transfer, and resetting o would wipe *this.
+ return *this;
+ }
+ alignment_ = o.alignment_;
+ buf_ = std::move(o.buf_);
+ capacity_ = o.capacity_;
+ cursize_ = o.cursize_;
+ bufstart_ = o.bufstart_;
+ // Moving a scalar is just a copy, so without this reset o would still
+ // point into the storage that *this now owns.
+ o.capacity_ = 0;
+ o.cursize_ = 0;
+ o.bufstart_ = nullptr;
+ return *this;
+ }
+
+ AlignedBuffer(const AlignedBuffer&) = delete;
+
+ AlignedBuffer& operator=(const AlignedBuffer&) = delete;
+
+ static bool isAligned(const void* ptr, size_t alignment) {
+ return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+ }
+
+ static bool isAligned(size_t n, size_t alignment) {
+ return n % alignment == 0;
+ }
+
+ size_t Alignment() const {
+ return alignment_;
+ }
+
+ size_t Capacity() const {
+ return capacity_;
+ }
+
+ size_t CurrentSize() const {
+ return cursize_;
+ }
+
+ const char* BufferStart() const {
+ return bufstart_;
+ }
+
+ char* BufferStart() { return bufstart_; }
+
+ void Clear() {
+ cursize_ = 0;
+ }
+
+ void Alignment(size_t alignment) {
+ assert(alignment > 0);
+ assert((alignment & (alignment - 1)) == 0);
+ alignment_ = alignment;
+ }
+
+ // Allocates a new buffer and sets the start position to the first aligned
+ // byte.
+ //
+ // requested_capacity: requested new buffer capacity. This capacity will be
+ // rounded up based on alignment.
+ // copy_data: Copy data from old buffer to new buffer. If copy_offset and
+ // copy_len are not passed in and the new requested capacity is bigger
+ // than the existing buffer's capacity, the data in the existing buffer is
+ // fully copied over to the new buffer.
+ // copy_offset: Copy data from this offset in old buffer.
+ // copy_len: Number of bytes to copy.
+ //
+ // The function does nothing if copy_data is true and the new
+ // requested_capacity is smaller than the number of bytes to copy (copy_len,
+ // or the current size when copy_len is 0), i.e. the old buffer is retained
+ // as is.
+ void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false,
+ uint64_t copy_offset = 0, size_t copy_len = 0) {
+ assert(alignment_ > 0);
+ assert((alignment_ & (alignment_ - 1)) == 0);
+
+ copy_len = copy_len > 0 ? copy_len : cursize_;
+ if (copy_data && requested_capacity < copy_len) {
+ // If we are downsizing to a capacity that is smaller than the current
+ // data in the buffer -- Ignore the request.
+ return;
+ }
+
+ size_t new_capacity = Roundup(requested_capacity, alignment_);
+ char* new_buf = new char[new_capacity + alignment_];
+ char* new_bufstart = reinterpret_cast<char*>(
+ (reinterpret_cast<uintptr_t>(new_buf) + (alignment_ - 1)) &
+ ~static_cast<uintptr_t>(alignment_ - 1));
+
+ if (copy_data) {
+ assert(bufstart_ + copy_offset + copy_len <= bufstart_ + cursize_);
+ memcpy(new_bufstart, bufstart_ + copy_offset, copy_len);
+ cursize_ = copy_len;
+ } else {
+ cursize_ = 0;
+ }
+
+ bufstart_ = new_bufstart;
+ capacity_ = new_capacity;
+ buf_.reset(new_buf);
+ }
+
+ // Append to the buffer.
+ //
+ // src : source to copy the data from.
+ // append_size : number of bytes to copy from src.
+ // Returns the number of bytes appended.
+ //
+ // If append_size is more than the remaining buffer size only the
+ // remaining-size worth of bytes are copied.
+ size_t Append(const char* src, size_t append_size) {
+ size_t buffer_remaining = capacity_ - cursize_;
+ size_t to_copy = std::min(append_size, buffer_remaining);
+
+ if (to_copy > 0) {
+ memcpy(bufstart_ + cursize_, src, to_copy);
+ cursize_ += to_copy;
+ }
+ return to_copy;
+ }
+
+ // Read from the buffer.
+ //
+ // dest : destination buffer to copy the data to.
+ // offset : the buffer offset to start reading from.
+ // read_size : the number of bytes to copy from the buffer to dest.
+ // Returns the number of bytes read/copied to dest.
+ size_t Read(char* dest, size_t offset, size_t read_size) const {
+ assert(offset < cursize_);
+
+ size_t to_read = 0;
+ if(offset < cursize_) {
+ to_read = std::min(cursize_ - offset, read_size);
+ }
+ if (to_read > 0) {
+ memcpy(dest, bufstart_ + offset, to_read);
+ }
+ return to_read;
+ }
+
+ // Pad to the end of alignment with "padding"
+ void PadToAlignmentWith(int padding) {
+ size_t total_size = Roundup(cursize_, alignment_);
+ size_t pad_size = total_size - cursize_;
+
+ if (pad_size > 0) {
+ assert((pad_size + cursize_) <= capacity_);
+ memset(bufstart_ + cursize_, padding, pad_size);
+ cursize_ += pad_size;
+ }
+ }
+
+ void PadWith(size_t pad_size, int padding) {
+ assert((pad_size + cursize_) <= capacity_);
+ memset(bufstart_ + cursize_, padding, pad_size);
+ cursize_ += pad_size;
+ }
+
+ // After a partial flush move the tail to the beginning of the buffer.
+ void RefitTail(size_t tail_offset, size_t tail_size) {
+ if (tail_size > 0) {
+ memmove(bufstart_, bufstart_ + tail_offset, tail_size);
+ }
+ cursize_ = tail_size;
+ }
+
+ // Returns a place to start appending.
+ // WARNING: Note that it is possible to write past the end of the buffer if
+ // the buffer is modified without using the write APIs or encapsulation
+ // offered by AlignedBuffer. It is up to the user to guard against such
+ // errors.
+ char* Destination() {
+ return bufstart_ + cursize_;
+ }
+
+ void Size(size_t cursize) {
+ cursize_ = cursize;
+ }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/autovector.h b/src/rocksdb/util/autovector.h
new file mode 100644
index 000000000..1e6c4716b
--- /dev/null
+++ b/src/rocksdb/util/autovector.h
@@ -0,0 +1,361 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <initializer_list>
+#include <iterator>
+#include <stdexcept>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_LITE
+template <class T, size_t kSize = 8>
+class autovector : public std::vector<T> {
+ using std::vector<T>::vector;
+};
+#else
+// A vector that leverages pre-allocated stack-based array to achieve better
+// performance for array with small amount of items.
+//
+// The interface resembles that of vector, but with less features since we aim
+// to solve the problem that we have in hand, rather than implementing a
+// full-fledged generic container.
+//
+// Currently we don't support:
+// * reserve()/shrink_to_fit()
+// If used correctly, in most cases, people should not touch the
+// underlying vector at all.
+// * random insert()/erase(), please only use push_back()/pop_back().
+// * No move/swap operations. Each autovector instance has a
+// stack-allocated array, and if we wanted to support move/swap operations
+// we would need to copy the arrays rather than just swap the pointers. In
+// this case we just explicitly forbid these operations, since they may
+// lead users to make the false assumption that they are inexpensive
+// operations.
+//
+// Naming style of public methods almost follows that of the STL's.
+template <class T, size_t kSize = 8>
+class autovector {
+ public:
+ // General STL-style container member types.
+ typedef T value_type;
+ typedef typename std::vector<T>::difference_type difference_type;
+ typedef typename std::vector<T>::size_type size_type;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+
+ // This class is the base for regular/const iterator
+ template <class TAutoVector, class TValueType>
+ class iterator_impl {
+ public:
+ // -- iterator traits
+ typedef iterator_impl<TAutoVector, TValueType> self_type;
+ typedef TValueType value_type;
+ typedef TValueType& reference;
+ typedef TValueType* pointer;
+ typedef typename TAutoVector::difference_type difference_type;
+ typedef std::random_access_iterator_tag iterator_category;
+
+ iterator_impl(TAutoVector* vect, size_t index)
+ : vect_(vect), index_(index) {};
+ iterator_impl(const iterator_impl&) = default;
+ ~iterator_impl() {}
+ iterator_impl& operator=(const iterator_impl&) = default;
+
+ // -- Advancement
+ // ++iterator
+ self_type& operator++() {
+ ++index_;
+ return *this;
+ }
+
+ // iterator++
+ self_type operator++(int) {
+ auto old = *this;
+ ++index_;
+ return old;
+ }
+
+ // --iterator
+ self_type& operator--() {
+ --index_;
+ return *this;
+ }
+
+ // iterator--
+ self_type operator--(int) {
+ auto old = *this;
+ --index_;
+ return old;
+ }
+
+ self_type operator-(difference_type len) const {
+ return self_type(vect_, index_ - len);
+ }
+
+ difference_type operator-(const self_type& other) const {
+ assert(vect_ == other.vect_);
+ return index_ - other.index_;
+ }
+
+ self_type operator+(difference_type len) const {
+ return self_type(vect_, index_ + len);
+ }
+
+ self_type& operator+=(difference_type len) {
+ index_ += len;
+ return *this;
+ }
+
+ self_type& operator-=(difference_type len) {
+ index_ -= len;
+ return *this;
+ }
+
+ // -- Reference
+ reference operator*() const {
+ assert(vect_->size() >= index_);
+ return (*vect_)[index_];
+ }
+
+ pointer operator->() const {
+ assert(vect_->size() >= index_);
+ return &(*vect_)[index_];
+ }
+
+ reference operator[](difference_type len) const {
+ return *(*this + len);
+ }
+
+ // -- Logical Operators
+ bool operator==(const self_type& other) const {
+ assert(vect_ == other.vect_);
+ return index_ == other.index_;
+ }
+
+ bool operator!=(const self_type& other) const { return !(*this == other); }
+
+ bool operator>(const self_type& other) const {
+ assert(vect_ == other.vect_);
+ return index_ > other.index_;
+ }
+
+ bool operator<(const self_type& other) const {
+ assert(vect_ == other.vect_);
+ return index_ < other.index_;
+ }
+
+ bool operator>=(const self_type& other) const {
+ assert(vect_ == other.vect_);
+ return index_ >= other.index_;
+ }
+
+ bool operator<=(const self_type& other) const {
+ assert(vect_ == other.vect_);
+ return index_ <= other.index_;
+ }
+
+ private:
+ TAutoVector* vect_ = nullptr;
+ size_t index_ = 0;
+ };
+
+ typedef iterator_impl<autovector, value_type> iterator;
+ typedef iterator_impl<const autovector, const value_type> const_iterator;
+ typedef std::reverse_iterator<iterator> reverse_iterator;
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+ autovector() : values_(reinterpret_cast<pointer>(buf_)) {}
+
+ autovector(std::initializer_list<T> init_list)
+ : values_(reinterpret_cast<pointer>(buf_)) {
+ for (const T& item : init_list) {
+ push_back(item);
+ }
+ }
+
+ ~autovector() { clear(); }
+
+ // -- Immutable operations
+ // Indicate if all data resides in in-stack data structure.
+ bool only_in_stack() const {
+ // If no element was inserted at all, the vector's capacity will be `0`.
+ return vect_.capacity() == 0;
+ }
+
+ size_type size() const { return num_stack_items_ + vect_.size(); }
+
+ // resize does not guarantee anything about the contents of the newly
+ // available elements
+ void resize(size_type n) {
+ if (n > kSize) {
+ vect_.resize(n - kSize);
+ while (num_stack_items_ < kSize) {
+ new ((void*)(&values_[num_stack_items_++])) value_type();
+ }
+ num_stack_items_ = kSize;
+ } else {
+ vect_.clear();
+ while (num_stack_items_ < n) {
+ new ((void*)(&values_[num_stack_items_++])) value_type();
+ }
+ while (num_stack_items_ > n) {
+ values_[--num_stack_items_].~value_type();
+ }
+ }
+ }
+
+ bool empty() const { return size() == 0; }
+
+ const_reference operator[](size_type n) const {
+ assert(n < size());
+ if (n < kSize) {
+ return values_[n];
+ }
+ return vect_[n - kSize];
+ }
+
+ reference operator[](size_type n) {
+ assert(n < size());
+ if (n < kSize) {
+ return values_[n];
+ }
+ return vect_[n - kSize];
+ }
+
+ const_reference at(size_type n) const {
+ assert(n < size());
+ return (*this)[n];
+ }
+
+ reference at(size_type n) {
+ assert(n < size());
+ return (*this)[n];
+ }
+
+ reference front() {
+ assert(!empty());
+ return *begin();
+ }
+
+ const_reference front() const {
+ assert(!empty());
+ return *begin();
+ }
+
+ reference back() {
+ assert(!empty());
+ return *(end() - 1);
+ }
+
+ const_reference back() const {
+ assert(!empty());
+ return *(end() - 1);
+ }
+
+ // -- Mutable Operations
+ // Appends `item` by moving from it. While fewer than kSize items are held
+ // the item goes into the inline buffer; after that it goes to vect_.
+ void push_back(T&& item) {
+ if (num_stack_items_ < kSize) {
+ new ((void*)(&values_[num_stack_items_])) value_type();
+ values_[num_stack_items_++] = std::move(item);
+ } else {
+ // `item` is a named rvalue reference, i.e. an lvalue in this scope;
+ // without std::move the overflow path would copy instead of move.
+ vect_.push_back(std::move(item));
+ }
+ }
+
+ void push_back(const T& item) {
+ if (num_stack_items_ < kSize) {
+ new ((void*)(&values_[num_stack_items_])) value_type();
+ values_[num_stack_items_++] = item;
+ } else {
+ vect_.push_back(item);
+ }
+ }
+
+ template <class... Args>
+ void emplace_back(Args&&... args) {
+ if (num_stack_items_ < kSize) {
+ new ((void*)(&values_[num_stack_items_++]))
+ value_type(std::forward<Args>(args)...);
+ } else {
+ vect_.emplace_back(std::forward<Args>(args)...);
+ }
+ }
+
+ void pop_back() {
+ assert(!empty());
+ if (!vect_.empty()) {
+ vect_.pop_back();
+ } else {
+ values_[--num_stack_items_].~value_type();
+ }
+ }
+
+ void clear() {
+ while (num_stack_items_ > 0) {
+ values_[--num_stack_items_].~value_type();
+ }
+ vect_.clear();
+ }
+
+ // -- Copy and Assignment
+ autovector& assign(const autovector& other);
+
+ autovector(const autovector& other) { assign(other); }
+
+ autovector& operator=(const autovector& other) { return assign(other); }
+
+ // -- Iterator Operations
+ iterator begin() { return iterator(this, 0); }
+
+ const_iterator begin() const { return const_iterator(this, 0); }
+
+ iterator end() { return iterator(this, this->size()); }
+
+ const_iterator end() const { return const_iterator(this, this->size()); }
+
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+ const_reverse_iterator rbegin() const {
+ return const_reverse_iterator(end());
+ }
+
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+
+ const_reverse_iterator rend() const {
+ return const_reverse_iterator(begin());
+ }
+
+ private:
+ size_type num_stack_items_ = 0; // current number of items
+ alignas(alignof(
+ value_type)) char buf_[kSize *
+ sizeof(value_type)]; // the first `kSize` items
+ pointer values_;
+ // used only if there are more than `kSize` items.
+ std::vector<T> vect_;
+};
+
+// Replaces the contents of *this with a copy of `other` and returns *this.
+// Also backs the copy constructor, where *this starts out empty
+// (num_stack_items_ is 0 and vect_ is empty via their default initializers).
+template <class T, size_t kSize>
+autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) {
+ values_ = reinterpret_cast<pointer>(buf_);
+ if (this == &other) {
+ // Self-assignment: already equal; avoid feeding vect_ its own iterators.
+ return *this;
+ }
+ // Destroy whatever is currently held so the inline slots are raw storage
+ // again and no previously stored item is left without its destructor run.
+ clear();
+
+ // copy the internal vector
+ vect_.assign(other.vect_.begin(), other.vect_.end());
+
+ // Copy-construct the inline items in place. The slots hold no live objects
+ // at this point, so assigning into them (as std::copy would) is undefined
+ // for non-trivial T. The count is bumped per element so that a throwing
+ // copy constructor leaves only fully constructed items accounted for.
+ while (num_stack_items_ < other.num_stack_items_) {
+ new ((void*)(&values_[num_stack_items_]))
+ value_type(other.values_[num_stack_items_]);
+ ++num_stack_items_;
+ }
+
+ return *this;
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/autovector_test.cc b/src/rocksdb/util/autovector_test.cc
new file mode 100644
index 000000000..adddd1b02
--- /dev/null
+++ b/src/rocksdb/util/autovector_test.cc
@@ -0,0 +1,330 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <iostream>
+#include <string>
+#include <utility>
+
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/autovector.h"
+#include "util/string_util.h"
+
+using std::cout;
+using std::endl;
+
+namespace ROCKSDB_NAMESPACE {
+
+class AutoVectorTest : public testing::Test {};
+const unsigned long kSize = 8;
+
+namespace {
+template <class T>
+void AssertAutoVectorOnlyInStack(autovector<T, kSize>* vec, bool result) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(vec->only_in_stack(), result);
+#else
+ (void) vec;
+ (void) result;
+#endif // !ROCKSDB_LITE
+}
+} // namespace
+
+TEST_F(AutoVectorTest, PushBackAndPopBack) {
+ autovector<size_t, kSize> vec;
+ ASSERT_TRUE(vec.empty());
+ ASSERT_EQ(0ul, vec.size());
+
+ for (size_t i = 0; i < 1000 * kSize; ++i) {
+ vec.push_back(i);
+ ASSERT_TRUE(!vec.empty());
+ if (i < kSize) {
+ AssertAutoVectorOnlyInStack(&vec, true);
+ } else {
+ AssertAutoVectorOnlyInStack(&vec, false);
+ }
+ ASSERT_EQ(i + 1, vec.size());
+ ASSERT_EQ(i, vec[i]);
+ ASSERT_EQ(i, vec.at(i));
+ }
+
+ size_t size = vec.size();
+ while (size != 0) {
+ vec.pop_back();
+ // will always be in heap
+ AssertAutoVectorOnlyInStack(&vec, false);
+ ASSERT_EQ(--size, vec.size());
+ }
+
+ ASSERT_TRUE(vec.empty());
+}
+
+TEST_F(AutoVectorTest, EmplaceBack) {
+ typedef std::pair<size_t, std::string> ValType;
+ autovector<ValType, kSize> vec;
+
+ for (size_t i = 0; i < 1000 * kSize; ++i) {
+ vec.emplace_back(i, ToString(i + 123));
+ ASSERT_TRUE(!vec.empty());
+ if (i < kSize) {
+ AssertAutoVectorOnlyInStack(&vec, true);
+ } else {
+ AssertAutoVectorOnlyInStack(&vec, false);
+ }
+
+ ASSERT_EQ(i + 1, vec.size());
+ ASSERT_EQ(i, vec[i].first);
+ ASSERT_EQ(ToString(i + 123), vec[i].second);
+ }
+
+ vec.clear();
+ ASSERT_TRUE(vec.empty());
+ AssertAutoVectorOnlyInStack(&vec, false);
+}
+
+TEST_F(AutoVectorTest, Resize) {
+ autovector<size_t, kSize> vec;
+
+ vec.resize(kSize);
+ AssertAutoVectorOnlyInStack(&vec, true);
+ for (size_t i = 0; i < kSize; ++i) {
+ vec[i] = i;
+ }
+
+ vec.resize(kSize * 2);
+ AssertAutoVectorOnlyInStack(&vec, false);
+ for (size_t i = 0; i < kSize; ++i) {
+ ASSERT_EQ(vec[i], i);
+ }
+ for (size_t i = 0; i < kSize; ++i) {
+ vec[i + kSize] = i;
+ }
+
+ vec.resize(1);
+ ASSERT_EQ(1U, vec.size());
+}
+
+namespace {
+void AssertEqual(
+ const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) {
+ ASSERT_EQ(a.size(), b.size());
+ ASSERT_EQ(a.empty(), b.empty());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(a.only_in_stack(), b.only_in_stack());
+#endif // !ROCKSDB_LITE
+ for (size_t i = 0; i < a.size(); ++i) {
+ ASSERT_EQ(a[i], b[i]);
+ }
+}
+} // namespace
+
+TEST_F(AutoVectorTest, CopyAndAssignment) {
+ // Test both heap-allocated and stack-allocated cases.
+ for (auto size : { kSize / 2, kSize * 1000 }) {
+ autovector<size_t, kSize> vec;
+ for (size_t i = 0; i < size; ++i) {
+ vec.push_back(i);
+ }
+
+ {
+ autovector<size_t, kSize> other;
+ other = vec;
+ AssertEqual(other, vec);
+ }
+
+ {
+ autovector<size_t, kSize> other(vec);
+ AssertEqual(other, vec);
+ }
+ }
+}
+
+TEST_F(AutoVectorTest, Iterators) {
+ autovector<std::string, kSize> vec;
+ for (size_t i = 0; i < kSize * 1000; ++i) {
+ vec.push_back(ToString(i));
+ }
+
+ // basic operator test
+ ASSERT_EQ(vec.front(), *vec.begin());
+ ASSERT_EQ(vec.back(), *(vec.end() - 1));
+ ASSERT_TRUE(vec.begin() < vec.end());
+
+ // non-const iterator
+ size_t index = 0;
+ for (const auto& item : vec) {
+ ASSERT_EQ(vec[index++], item);
+ }
+
+ index = vec.size() - 1;
+ for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) {
+ ASSERT_EQ(vec[index--], *pos);
+ }
+
+ // const iterator
+ const auto& cvec = vec;
+ index = 0;
+ for (const auto& item : cvec) {
+ ASSERT_EQ(cvec[index++], item);
+ }
+
+ index = vec.size() - 1;
+ for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) {
+ ASSERT_EQ(cvec[index--], *pos);
+ }
+
+ // forward and backward
+ auto pos = vec.begin();
+ while (pos != vec.end()) {
+ auto old_val = *pos;
+ auto old = pos++;
+ // HACK: make sure -> works
+ ASSERT_TRUE(!old->empty());
+ ASSERT_EQ(old_val, *old);
+ ASSERT_TRUE(pos == vec.end() || old_val != *pos);
+ }
+
+ pos = vec.begin();
+ for (size_t i = 0; i < vec.size(); i += 2) {
+ // Cannot use ASSERT_EQ since that macro depends on iostream serialization
+ ASSERT_TRUE(pos + 2 - 2 == pos);
+ pos += 2;
+ ASSERT_TRUE(pos >= vec.begin());
+ ASSERT_TRUE(pos <= vec.end());
+
+ size_t diff = static_cast<size_t>(pos - vec.begin());
+ ASSERT_EQ(i + 2, diff);
+ }
+}
+
+namespace {
+std::vector<std::string> GetTestKeys(size_t size) {
+ std::vector<std::string> keys;
+ keys.resize(size);
+
+ int index = 0;
+ for (auto& key : keys) {
+ key = "item-" + ROCKSDB_NAMESPACE::ToString(index++);
+ }
+ return keys;
+}
+} // namespace
+
+// Times the creation of `ops` fresh TVector instances, each filled with
+// `item_size` elements via push_back, and prints the elapsed nanoseconds.
+// Elements are taken consecutively from `items` across all instances, so
+// `items` must hold at least ops * item_size entries.
+template <class TVector>
+void BenchmarkVectorCreationAndInsertion(
+ std::string name, size_t ops, size_t item_size,
+ const std::vector<typename TVector::value_type>& items) {
+ // Every push_back below consumes one distinct entry of `items`.
+ assert(ops * item_size <= items.size());
+ auto env = Env::Default();
+
+ // size_t matches the vector's index type (the original used a signed int).
+ size_t index = 0;
+ auto start_time = env->NowNanos();
+ auto ops_remaining = ops;
+ while(ops_remaining--) {
+ TVector v;
+ for (size_t i = 0; i < item_size; ++i) {
+ v.push_back(items[index++]);
+ }
+ }
+ auto elapsed = env->NowNanos() - start_time;
+ cout << "created " << ops << " " << name << " instances:\n\t"
+ << "each was inserted with " << item_size << " elements\n\t"
+ << "total time elapsed: " << elapsed << " (ns)" << endl;
+}
+
+template <class TVector>
+size_t BenchmarkSequenceAccess(std::string name, size_t ops, size_t elem_size) {
+ TVector v;
+ for (const auto& item : GetTestKeys(elem_size)) {
+ v.push_back(item);
+ }
+ auto env = Env::Default();
+
+ auto ops_remaining = ops;
+ auto start_time = env->NowNanos();
+ size_t total = 0;
+ while (ops_remaining--) {
+ auto end = v.end();
+ for (auto pos = v.begin(); pos != end; ++pos) {
+ total += pos->size();
+ }
+ }
+ auto elapsed = env->NowNanos() - start_time;
+ cout << "performed " << ops << " sequence access against " << name << "\n\t"
+ << "size: " << elem_size << "\n\t"
+ << "total time elapsed: " << elapsed << " (ns)" << endl;
+ // HACK avoid compiler's optimization to ignore total
+ return total;
+}
+
+// This test case only reports the performance between std::vector<std::string>
+// and autovector<std::string>. We chose string for comparison because in most
+// of our use cases we used std::vector<std::string>.
+TEST_F(AutoVectorTest, PerfBench) {
+ // We run same operations for kOps times in order to get a more fair result.
+ size_t kOps = 100000;
+
+ // Creation and insertion test
+ // Test the case when there is:
+ // * no element inserted: internal array of std::vector may not really get
+ // initialize.
+ // * one element inserted: internal array of std::vector must have
+ // initialized.
+ // * kSize elements inserted. This shows the most time we'll spend if we
+ // keep everything in stack.
+ // * 2 * kSize elements inserted. The internal vector of
+ // autovector must have been initialized.
+ cout << "=====================================================" << endl;
+ cout << "Creation and Insertion Test (value type: std::string)" << endl;
+ cout << "=====================================================" << endl;
+
+ // pre-generated unique keys
+ auto string_keys = GetTestKeys(kOps * 2 * kSize);
+ for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
+ BenchmarkVectorCreationAndInsertion<std::vector<std::string>>(
+ "std::vector<std::string>", kOps, insertions, string_keys);
+ BenchmarkVectorCreationAndInsertion<autovector<std::string, kSize>>(
+ "autovector<std::string>", kOps, insertions, string_keys);
+ cout << "-----------------------------------" << endl;
+ }
+
+ cout << "=====================================================" << endl;
+ cout << "Creation and Insertion Test (value type: uint64_t)" << endl;
+ cout << "=====================================================" << endl;
+
+ // pre-generated unique keys
+ std::vector<uint64_t> int_keys(kOps * 2 * kSize);
+ for (size_t i = 0; i < kOps * 2 * kSize; ++i) {
+ int_keys[i] = i;
+ }
+ for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) {
+ BenchmarkVectorCreationAndInsertion<std::vector<uint64_t>>(
+ "std::vector<uint64_t>", kOps, insertions, int_keys);
+ BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>(
+ "autovector<uint64_t>", kOps, insertions, int_keys
+ );
+ cout << "-----------------------------------" << endl;
+ }
+
+ // Sequence Access Test
+ cout << "=====================================================" << endl;
+ cout << "Sequence Access Test" << endl;
+ cout << "=====================================================" << endl;
+ for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) {
+ BenchmarkSequenceAccess<std::vector<std::string>>("std::vector", kOps,
+ elem_size);
+ BenchmarkSequenceAccess<autovector<std::string, kSize>>("autovector", kOps,
+ elem_size);
+ cout << "-----------------------------------" << endl;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/bloom_impl.h b/src/rocksdb/util/bloom_impl.h
new file mode 100644
index 000000000..54c048485
--- /dev/null
+++ b/src/rocksdb/util/bloom_impl.h
@@ -0,0 +1,483 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Implementation details of various Bloom filter implementations used in
+// RocksDB. (DynamicBloom is in a separate file for now because it
+// supports concurrent write.)
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+#include <cmath>
+
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+#ifdef HAVE_AVX2
+#include <immintrin.h>
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Closed-form false positive (FP) rate estimates shared by the Bloom filter
+// implementations below. All functions are pure and stateless.
+class BloomMath {
+ public:
+  // False positive rate of a standard Bloom filter, for given ratio of
+  // filter memory bits to added keys, and number of probes per operation.
+  // (The false positive rate is effectively independent of scale, assuming
+  // the implementation scales OK.)
+  static double StandardFpRate(double bits_per_key, int num_probes) {
+    // Standard very-good-estimate formula. See
+    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
+  }
+
+  // False positive rate of a "blocked"/"sharded"/"cache-local" Bloom filter,
+  // for given ratio of filter memory bits to added keys, number of probes per
+  // operation (all within the given block or cache line size), and block or
+  // cache line size.
+  //
+  // Returns 1.0 when bits_per_key <= 0 (no filter memory per key).
+  static double CacheLocalFpRate(double bits_per_key, int num_probes,
+                                 int cache_line_bits) {
+    if (bits_per_key <= 0.0) {
+      // Without this guard, bits_per_key == 0 makes keys_per_cache_line and
+      // keys_stddev both infinite, and their difference below is NaN.
+      return 1.0;
+    }
+    double keys_per_cache_line = cache_line_bits / bits_per_key;
+    // A reasonable estimate is the average of the FP rates for one standard
+    // deviation above and below the mean bucket occupancy. See
+    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
+    double keys_stddev = std::sqrt(keys_per_cache_line);
+    double crowded_fp = StandardFpRate(
+        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
+    // With at most one key per cache line on average, "one standard
+    // deviation below the mean" is an occupancy of zero or less. Treat that
+    // as an empty line (FP rate 0) instead of passing a zero or negative
+    // occupancy on to StandardFpRate.
+    double uncrowded_fp = 0.0;
+    if (keys_per_cache_line > keys_stddev) {
+      uncrowded_fp = StandardFpRate(
+          cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
+    }
+    return (crowded_fp + uncrowded_fp) / 2;
+  }
+
+  // False positive rate of querying a new item against `num_keys` items, all
+  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
+  // themselves are stored losslessly. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
+    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
+    // Base estimate assumes each key maps to a unique fingerprint.
+    // Could be > 1 in extreme cases.
+    double base_estimate = num_keys * inv_fingerprint_space;
+    // To account for potential overlap, we choose between two formulas
+    if (base_estimate > 0.0001) {
+      // A very good formula assuming we don't construct a floating point
+      // number extremely close to 1. Always produces a probability < 1.
+      return 1.0 - std::exp(-base_estimate);
+    } else {
+      // A very good formula when base_estimate is far below 1. (Subtract
+      // away the integral-approximated sum that some key has same hash as
+      // one coming before it in a list.)
+      return base_estimate - (base_estimate * base_estimate * 0.5);
+    }
+  }
+
+  // Returns the probability of either of two independent(-ish) events
+  // happening, given their probabilities. (This is useful for combining
+  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
+  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double IndependentProbabilitySum(double rate1, double rate2) {
+    // Use formula that avoids floating point extremely close to 1 if
+    // rates are extremely small.
+    return rate1 + rate2 - (rate1 * rate2);
+  }
+};
+
+// A fast, flexible, and accurate cache-local Bloom implementation with
+// SIMD-optimized query performance (currently using AVX2 on Intel). Write
+// performance and non-SIMD read are very good, benefiting from fastrange32
+// used in place of % and single-cycle multiplication on recent processors.
+//
+// Most other SIMD Bloom implementations sacrifice flexibility and/or
+// accuracy by requiring num_probes to be a power of two and restricting
+// where each probe can occur in a cache line. This implementation sacrifices
+// SIMD-optimization for add (might still be possible, especially with AVX512)
+// in favor of allowing any num_probes, not crossing cache line boundary,
+// and accuracy close to theoretical best accuracy for a cache-local Bloom.
+// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket
+// (Intel cache line size) is 0.9535% FP rate. This implementation yields
+// about 0.957%. (Compare to LegacyLocalityBloomImpl<false> at 1.138%, or
+// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.)
+//
+// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or
+// a 64-bit hash (split into two uint32s). With many millions of keys, the
+// false positive rate associated with using a 32-bit hash can dominate the
+// false positive rate of the underlying filter. At 10 bits/key setting, the
+// inflection point is about 40 million keys, so 32-bit hash is a bad idea
+// with 10s of millions of keys or more.
+//
+// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange
+// to pick a cache line, which can be faster than 64-bit in some cases.
+// This only hurts accuracy as you get into 10s of GB for a single filter,
+// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to
+// 64-bit fastrange if you need filters so big. ;)
+//
+// Using only a 32-bit input hash within each cache line has negligible
+// impact for any reasonable cache line / bucket size, for arbitrary filter
+// size, and potentially saves intermediate data size in some cases vs.
+// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic
+// to generate indices, I might do the same, as a single multiplication
+// suffices to generate a sufficiently mixed 64 bits from 32 bits.)
+//
+// This implementation is currently tied to Intel cache line size, 64 bytes ==
+// 512 bits. If there's sufficient demand for other cache line sizes, this is
+// a pretty good implementation to extend, but slight performance enhancements
+// are possible with an alternate implementation (probably not very compatible
+// with SIMD):
+// (1) Use rotation in addition to multiplication for remixing
+// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy
+// because lower bits never depend on original upper bits.)
+// (2) Extract more than one bit index from each re-mix. (Only if rotation
+// or similar is part of remix, because otherwise you're making the
+// multiplication-only problem worse.)
+// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per
+// re-mix.
+//
+class FastLocalBloomImpl {
+ public:
+ // Estimated overall FP rate for `keys` keys stored in `bytes` bytes of
+ // filter using `num_probes` probes and `hash_bits`-bit key hashes: the
+ // cache-local filter rate (512-bit lines) combined with the fingerprint
+ // collision rate.
+ // NOTE: this has only been validated to enough accuracy for producing
+ // reasonable warnings / user feedback, not for making functional decisions.
+ static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes,
+                               int hash_bits) {
+   const double filter_fp =
+       BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes,
+                                   /*cache line bits*/ 512);
+   const double fingerprint_fp = BloomMath::FingerprintFpRate(keys, hash_bits);
+   return BloomMath::IndependentProbabilitySum(filter_fp, fingerprint_fp);
+ }
+
+ // Maps a bits-per-key setting, given in thousandths of a bit, to the
+ // number of probes to use per key (between 1 and 24).
+ static inline int ChooseNumProbes(int millibits_per_key) {
+   // Since this implementation can (with AVX2) make up to 8 probes for the
+   // same cost, the most accurate num_probes is picked, based on actual
+   // tests of the implementation. For higher bits/key, the best choice for
+   // cache-local Bloom can be notably smaller than for standard Bloom,
+   // e.g. 9 instead of 11 @ 16 b/k.
+   //
+   // kUpperBounds[i] is the largest setting that still gets i + 1 probes.
+   // The bound for 8 probes would be something like 13800, but 14001 is
+   // used, sacrificing *slightly* for more settings using <= 8 probes.
+   static const int kUpperBounds[] = {2080,  3580,  5100,  6640,
+                                      8300,  10070, 11720, 14001,
+                                      16050, 18300, 22001, 25501};
+   const int num_bounds =
+       static_cast<int>(sizeof(kUpperBounds) / sizeof(kUpperBounds[0]));
+   for (int idx = 0; idx < num_bounds; ++idx) {
+     if (millibits_per_key <= kUpperBounds[idx]) {
+       return idx + 1;
+     }
+   }
+   if (millibits_per_key > 50000) {
+     // Top out at 24 probes (three sets of 8)
+     return 24;
+   }
+   // Roughly optimal choices for remaining range
+   // e.g.
+   // 28000 -> 12, 28001 -> 13
+   // 50000 -> 23, 50001 -> 24
+   // (Settings 25502..26000 evaluate to 11 here, one fewer than the 12
+   // returned for 25501.)
+   return (millibits_per_key - 1) / 2000 - 1;
+ }
+
+ // Adds a key, given as the hash pair (h1, h2), to the filter `data` of
+ // `len_bytes` bytes: h1 selects a 64-byte cache line, and h2 drives the
+ // `num_probes` bit positions set within that line.
+ static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
+                            int num_probes, char *data) {
+   // len_bytes >> 6 is the number of 64-byte lines; << 6 converts the
+   // selected line index back into a byte offset.
+   const uint32_t line_offset = fastrange32(len_bytes >> 6, h1) << 6;
+   char *const cache_line = data + line_offset;
+   AddHashPrepared(h2, num_probes, cache_line);
+ }
+
+ // Sets `num_probes` bits in the 64-byte (512-bit) cache line starting at
+ // `data_at_cache_line`. Bit positions come from the top 9 bits of h2,
+ // which is re-mixed by multiplying with 0x9e3779b9 after each probe.
+ static inline void AddHashPrepared(uint32_t h2, int num_probes,
+                                    char *data_at_cache_line) {
+   uint32_t mixed = h2;
+   for (int remaining = num_probes; remaining > 0; --remaining) {
+     // 9-bit address within 512 bit cache line
+     const int bit_index = mixed >> (32 - 9);
+     const int byte_index = bit_index >> 3;
+     data_at_cache_line[byte_index] |= (uint8_t{1} << (bit_index & 7));
+     mixed *= uint32_t{0x9e3779b9};
+   }
+ }
+
+ // Computes the byte offset of the 64-byte cache line that h1 selects in a
+ // filter of `len_bytes` bytes, issues prefetches for the first and last
+ // byte of that line, and stores the offset in *byte_offset.
+ static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
+                                const char *data,
+                                uint32_t /*out*/ *byte_offset) {
+   const uint32_t line_offset = fastrange32(len_bytes >> 6, h1) << 6;
+   const char *const line_begin = data + line_offset;
+   PREFETCH(line_begin, 0 /* rw */, 1 /* locality */);
+   PREFETCH(line_begin + 63, 0 /* rw */, 1 /* locality */);
+   *byte_offset = line_offset;
+ }
+
+ // Queries the filter `data` of `len_bytes` bytes for the hash pair
+ // (h1, h2): h1 selects the 64-byte cache line, and the probes derived from
+ // h2 are checked within it. Returns false only if some probed bit is
+ // unset.
+ static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
+                                 int num_probes, const char *data) {
+   const uint32_t line_offset = fastrange32(len_bytes >> 6, h1) << 6;
+   const char *const cache_line = data + line_offset;
+   return HashMayMatchPrepared(h2, num_probes, cache_line);
+ }
+
+ static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
+ const char *data_at_cache_line) {
+ uint32_t h = h2;  // running hash; re-mixed by multiplication as probes are consumed
+#ifdef HAVE_AVX2
+ int rem_probes = num_probes;  // probes still to be checked; reduced by 8 per loop iteration
+
+ // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18,
+ // etc.} one can insert specialized code for rem_probes <= 2, bypassing
+ // the SIMD code in those cases. There is a detectable but minor overhead
+ // applied to other values of num_probes (when not statically determined),
+ // but smoother performance curve vs. num_probes. But for now, when
+ // in doubt, don't add unnecessary code.
+
+ // Powers of 32-bit golden ratio, mod 2**32.
+ const __m256i multipliers =
+ _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9,
+ 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749);
+
+ for (;;) {
+ // Eight copies of hash
+ __m256i hash_vector = _mm256_set1_epi32(h);
+
+ // Same effect as repeated multiplication by 0x9e3779b9 thanks to
+ // associativity of multiplication.
+ hash_vector = _mm256_mullo_epi32(hash_vector, multipliers);
+
+ // Now the top 9 bits of each of the eight 32-bit values in
+ // hash_vector are bit addresses for probes within the cache line.
+ // While the platform-independent code uses byte addressing (6 bits
+ // to pick a byte + 3 bits to pick a bit within a byte), here we work
+ // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit
+ // within a word) because that works well with AVX2 and is equivalent
+ // under little-endian.
+
+ // Shift each right by 28 bits to get 4-bit word addresses.
+ const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28);
+
+ // Gather 32-bit values spread over 512 bits by 4-bit address. In
+ // essence, we are dereferencing eight pointers within the cache
+ // line.
+ //
+ // Option 1: AVX2 gather (seems to be a little slow - understandable)
+ // const __m256i value_vector =
+ // _mm256_i32gather_epi32(static_cast<const int
+ // *>(data_at_cache_line),
+ // word_addresses,
+ // /*bytes / i32*/ 4);
+ // END Option 1
+ // Potentially unaligned as we're not *always* cache-aligned -> loadu
+ const __m256i *mm_data =
+ reinterpret_cast<const __m256i *>(data_at_cache_line);
+ __m256i lower = _mm256_loadu_si256(mm_data);
+ __m256i upper = _mm256_loadu_si256(mm_data + 1);
+ // Option 2: AVX512VL permute hack
+ // Only negligibly faster than Option 3, so not yet worth supporting
+ // const __m256i value_vector =
+ // _mm256_permutex2var_epi32(lower, word_addresses, upper);
+ // END Option 2
+ // Option 3: AVX2 permute+blend hack
+ // Use lowest three bits to order probing values, as if all from same
+ // 256 bit piece.
+ lower = _mm256_permutevar8x32_epi32(lower, word_addresses);
+ upper = _mm256_permutevar8x32_epi32(upper, word_addresses);
+ // Just top 1 bit of address, to select between lower and upper.
+ const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31);
+ // Finally: the next 8 probed 32-bit values, in probing sequence order.
+ const __m256i value_vector =
+ _mm256_blendv_epi8(lower, upper, upper_lower_selector);
+ // END Option 3
+
+ // We might not need to probe all 8, so build a mask for selecting only
+ // what we need. (The k_selector(s) could be pre-computed but that
+ // doesn't seem to make a noticeable performance difference.)
+ const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ // Subtract rem_probes from each of those constants
+ __m256i k_selector =
+ _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes));
+ // Negative after subtract -> use/select
+ // Keep only high bit (logical shift right each by 31).
+ k_selector = _mm256_srli_epi32(k_selector, 31);
+
+ // Strip off the 4 bit word address (shift left)
+ __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4);
+ // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses.
+ bit_addresses = _mm256_srli_epi32(bit_addresses, 27);
+ // Build a bit mask
+ const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses);
+
+ // Like (((~value_vector) & bit_mask) == 0)
+ bool match = _mm256_testc_si256(value_vector, bit_mask) != 0;
+
+ // This check first so that it's easy for branch predictor to optimize
+ // num_probes <= 8 case, making it free of unpredictable branches.
+ if (rem_probes <= 8) {
+ return match;
+ } else if (!match) {
+ return false;
+ }
+ // otherwise
+ // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power, mod 2**32
+ h *= 0xab25f4c1;
+ rem_probes -= 8;
+ }
+#else
+ for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
+ // 9-bit address within 512 bit cache line
+ int bitpos = h >> (32 - 9);
+ if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) {
+ return false;
+ }
+ }
+ return true;  // every probed bit was set
+#endif
+ }
+};
+
+// A legacy Bloom filter implementation with no locality of probes (slow).
+// It uses double hashing to generate a sequence of hash values.
+// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have
+// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004].
+//
+// DO NOT REUSE
+//
+class LegacyNoLocalityBloomImpl {
+ public:
+  // Number of probes for a given bits-per-key setting: bits_per_key * 0.69,
+  // truncated toward zero and clamped to the range [1, 30].
+  static inline int ChooseNumProbes(int bits_per_key) {
+    // We intentionally round down to reduce probing cost a little bit
+    const int estimate =
+        static_cast<int>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
+    if (estimate < 1) {
+      return 1;
+    }
+    if (estimate > 30) {
+      return 30;
+    }
+    return estimate;
+  }
+
+  // Sets `num_probes` bits in `data`, treated as an array of `total_bits`
+  // bits. The probe sequence is h, h + step, h + 2 * step, ... (each taken
+  // modulo total_bits), where step is h rotated right by 17 bits.
+  static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
+                             char *data) {
+    const uint32_t step = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (int remaining = num_probes; remaining > 0; --remaining, h += step) {
+      const uint32_t bit_index = h % total_bits;
+      data[bit_index >> 3] |= (1 << (bit_index & 7));
+    }
+  }
+
+  // Checks the same probe sequence as AddHash. Returns false as soon as a
+  // probed bit is found unset, and true if all `num_probes` bits are set.
+  static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
+                                  int num_probes, const char *data) {
+    const uint32_t step = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (int remaining = num_probes; remaining > 0; --remaining, h += step) {
+      const uint32_t bit_index = h % total_bits;
+      const bool bit_is_set =
+          (data[bit_index >> 3] & (1 << (bit_index & 7))) != 0;
+      if (!bit_is_set) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+// A legacy Bloom filter implementation with probes local to a single
+// cache line (fast). Because SST files might be transported between
+// platforms, the cache line size is a parameter rather than hard coded.
+// (But if specified as a constant parameter, an optimizing compiler
+// should take advantage of that.)
+//
+// When ExtraRotates is false, this implementation is notably deficient in
+// accuracy. Specifically, it uses double hashing with a 1/512 chance of the
+// increment being zero (when cache line size is 512 bits). Thus, there's a
+// 1/512 chance of probing only one index, which we'd expect to incur about
+// a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at
+// https://github.com/facebook/rocksdb/issues/4120
+//
+// DO NOT REUSE
+//
+template <bool ExtraRotates>
+class LegacyLocalityBloomImpl {
+ private:
+  // Picks the cache line for hash `h` among `num_lines` lines. With
+  // ExtraRotates, the hash is first rotated right by 11 bits.
+  static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) {
+    if (ExtraRotates) {
+      h = (h >> 11) | (h << 21);
+    }
+    return h % num_lines;
+  }
+
+ public:
+  // Estimated overall FP rate for `keys` keys stored in `bytes` bytes of
+  // filter with `num_probes` probes, assuming 512-bit cache lines.
+  // NOTE: this has only been validated to enough accuracy for producing
+  // reasonable warnings / user feedback, not for making functional decisions.
+  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) {
+    const double bits_per_key = 8.0 * bytes / keys;
+    double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes,
+                                                     /*cache line bits*/ 512);
+    if (ExtraRotates) {
+      // Not yet validated
+      assert(false);
+    } else {
+      // Good estimate of impact of flaw in index computation.
+      // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key.
+      // The + 22 shifts it nicely to fit for lower bits/key.
+      filter_rate += 0.1 / (bits_per_key * 0.75 + 22);
+    }
+    // Always uses 32-bit hash
+    const double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32);
+    return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate);
+  }
+
+  // Sets `num_probes` bits within the cache line of `data` selected by
+  // GetLine(h, num_lines). Each cache line is 2^log2_cache_line_bytes bytes.
+  static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
+                             char *data, int log2_cache_line_bytes) {
+    const int log2_cache_line_bits = log2_cache_line_bytes + 3;
+    const uint32_t line_offset = GetLine(h, num_lines)
+                                 << log2_cache_line_bytes;
+    char *const line = data + line_offset;
+    const uint32_t step = (h >> 17) | (h << 15);
+
+    for (int remaining = num_probes; remaining > 0; --remaining) {
+      // Mask to bit-within-cache-line address
+      const uint32_t bit_index = h & ((1 << log2_cache_line_bits) - 1);
+      line[bit_index >> 3] |= (1 << (bit_index & 7));
+      if (ExtraRotates) {
+        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
+      }
+      h += step;
+    }
+  }
+
+  // Computes the byte offset of the cache line selected by h, issues
+  // prefetches for the first and last byte of that line, and stores the
+  // offset in *byte_offset.
+  static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
+                                         const char *data,
+                                         uint32_t /*out*/ *byte_offset,
+                                         int log2_cache_line_bytes) {
+    const uint32_t line_offset = GetLine(h, num_lines)
+                                 << log2_cache_line_bytes;
+    PREFETCH(data + line_offset, 0 /* rw */, 1 /* locality */);
+    PREFETCH(data + line_offset + ((1 << log2_cache_line_bytes) - 1),
+             0 /* rw */, 1 /* locality */);
+    *byte_offset = line_offset;
+  }
+
+  // Locates the cache line for h and checks its probes there. Returns false
+  // only if some probed bit is unset.
+  static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
+                                  int num_probes, const char *data,
+                                  int log2_cache_line_bytes) {
+    const uint32_t line_offset = GetLine(h, num_lines)
+                                 << log2_cache_line_bytes;
+    return HashMayMatchPrepared(h, num_probes, data + line_offset,
+                                log2_cache_line_bytes);
+  }
+
+  // Checks the probes for h within the cache line starting at
+  // `data_at_offset`, following the same probe sequence as AddHash.
+  static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
+                                          const char *data_at_offset,
+                                          int log2_cache_line_bytes) {
+    const int log2_cache_line_bits = log2_cache_line_bytes + 3;
+    const uint32_t step = (h >> 17) | (h << 15);
+
+    for (int remaining = num_probes; remaining > 0; --remaining) {
+      // Mask to bit-within-cache-line address
+      const uint32_t bit_index = h & ((1 << log2_cache_line_bits) - 1);
+      const bool bit_is_set =
+          (data_at_offset[bit_index >> 3] & (1 << (bit_index & 7))) != 0;
+      if (!bit_is_set) {
+        return false;
+      }
+      if (ExtraRotates) {
+        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
+      }
+      h += step;
+    }
+    return true;
+  }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/bloom_test.cc b/src/rocksdb/util/bloom_test.cc
new file mode 100644
index 000000000..c88d7ee32
--- /dev/null
+++ b/src/rocksdb/util/bloom_test.cc
@@ -0,0 +1,912 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef GFLAGS
+#include <cstdio>
+// Stand-in entry point used when the test is compiled without gflags: it
+// prints a notice to stderr and returns 0.
+int main() {
+  fputs("Please install gflags to run this test... Skipping...\n", stderr);
+  return 0;
+}
+#else
+
+#include <array>
+#include <cmath>
+#include <vector>
+
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/hash.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(bits_per_key, 10, "");
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kVerbose = 1;
+
+// Encodes `i` with PutFixed32 into the caller-provided `buffer` (which must
+// hold at least sizeof(int) bytes) and returns a Slice viewing those bytes.
+static Slice Key(int i, char* buffer) {
+  std::string encoded;
+  PutFixed32(&encoded, static_cast<uint32_t>(i));
+  memcpy(buffer, encoded.data(), sizeof(int));
+  return Slice(buffer, sizeof(int));
+}
+
+// Returns the next key count to test after `length`. The step grows with
+// the magnitude: +1 below 10, +10 below 100, +100 below 1000, and +1000
+// from there on.
+static int NextLength(int length) {
+  const int step = (length < 10)     ? 1
+                   : (length < 100)  ? 10
+                   : (length < 1000) ? 100
+                                     : 1000;
+  return length + step;
+}
+
+// Fixture for filters created by BloomFilterPolicy in kDeprecatedBlock
+// mode. Keys are collected with Add(), turned into a filter by Build(), and
+// queried with Matches().
+class BlockBasedBloomTest : public testing::Test {
+ private:
+  std::unique_ptr<const FilterPolicy> policy_;
+  std::string filter_;             // most recently built filter
+  std::vector<std::string> keys_;  // keys added since the last Build()
+
+ public:
+  BlockBasedBloomTest() { ResetPolicy(); }
+
+  // Discards pending keys and the current filter; the policy is kept.
+  void Reset() {
+    keys_.clear();
+    filter_.clear();
+  }
+
+  // Installs a fresh kDeprecatedBlock policy with the given bits per key
+  // and resets all filter state.
+  void ResetPolicy(double bits_per_key) {
+    policy_.reset(new BloomFilterPolicy(bits_per_key,
+                                        BloomFilterPolicy::kDeprecatedBlock));
+    Reset();
+  }
+
+  // Same as above, using the --bits_per_key flag value.
+  void ResetPolicy() { ResetPolicy(FLAGS_bits_per_key); }
+
+  // Queues a copy of `s` for the next Build().
+  void Add(const Slice& s) { keys_.push_back(s.ToString()); }
+
+  // Replaces the current filter with one built from the pending keys, then
+  // clears the pending keys.
+  void Build() {
+    std::vector<Slice> key_slices;
+    key_slices.reserve(keys_.size());
+    for (const std::string& key : keys_) {
+      key_slices.push_back(Slice(key));
+    }
+    filter_.clear();
+    // data() instead of &key_slices[0]: indexing an empty vector is
+    // undefined behavior, and Build() can be called with no pending keys.
+    policy_->CreateFilter(key_slices.data(),
+                          static_cast<int>(key_slices.size()), &filter_);
+    keys_.clear();
+    if (kVerbose >= 2) DumpFilter();
+  }
+
+  size_t FilterSize() const { return filter_.size(); }
+
+  Slice FilterData() const { return Slice(filter_); }
+
+  // Prints the filter bits to stderr, least significant bit of each byte
+  // first, as '1' (set) or '.' (clear). The final byte is not printed.
+  void DumpFilter() {
+    fprintf(stderr, "F(");
+    for (size_t i = 0; i + 1 < filter_.size(); i++) {
+      const unsigned int c = static_cast<unsigned int>(filter_[i]);
+      for (int j = 0; j < 8; j++) {
+        fprintf(stderr, "%c", (c & (1 << j)) ? '1' : '.');
+      }
+    }
+    fprintf(stderr, ")\n");
+  }
+
+  // Returns whether the filter reports a possible match for `s`, building
+  // the filter first if keys are pending.
+  bool Matches(const Slice& s) {
+    if (!keys_.empty()) {
+      Build();
+    }
+    return policy_->KeyMayMatch(s, filter_);
+  }
+
+  // Fraction of the 10000 keys Key(1000000000) .. Key(1000009999) that the
+  // filter reports as possible matches.
+  double FalsePositiveRate() {
+    char buffer[sizeof(int)];
+    int result = 0;
+    for (int i = 0; i < 10000; i++) {
+      if (Matches(Key(i + 1000000000, buffer))) {
+        result++;
+      }
+    }
+    return result / 10000.0;
+  }
+};
+
+// With no keys added, lookups must not report a match.
+TEST_F(BlockBasedBloomTest, EmptyFilter) {
+  ASSERT_FALSE(Matches("hello"));
+  ASSERT_FALSE(Matches("world"));
+}
+
+// Two added keys must match; two keys that were never added must not.
+TEST_F(BlockBasedBloomTest, Small) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_FALSE(Matches("x"));
+  ASSERT_FALSE(Matches("foo"));
+}
+
+// Builds filters for key counts from 1 up to 10000 and checks, for each,
+// the filter size bound, that every added key matches, and that the
+// measured false positive rate stays acceptable.
+TEST_F(BlockBasedBloomTest, VaryingLengths) {
+  char buffer[sizeof(int)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+
+  int length = 1;
+  while (length <= 10000) {
+    Reset();
+    for (int k = 0; k < length; ++k) {
+      Add(Key(k, buffer));
+    }
+    Build();
+
+    ASSERT_LE(FilterSize(), static_cast<size_t>((length * 10 / 8) + 40))
+        << length;
+
+    // All added keys must match
+    for (int k = 0; k < length; ++k) {
+      ASSERT_TRUE(Matches(Key(k, buffer)))
+          << "Length " << length << "; key " << k;
+    }
+
+    // Check false positive rate
+    const double rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              rate * 100.0, length, static_cast<int>(FilterSize()));
+    }
+    ASSERT_LE(rate, 0.02);  // Must not be over 2%
+    // Above 1.25% is allowed, but not too often
+    (rate > 0.0125 ? mediocre_filters : good_filters)++;
+
+    length = NextLength(length);
+  }
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
+            mediocre_filters);
+  }
+  ASSERT_LE(mediocre_filters, good_filters / 5);
+}
+
+// Ensure the implementation doesn't accidentally change in an
+// incompatible way: each configuration below must produce a filter whose
+// BloomHash equals the recorded value.
+TEST_F(BlockBasedBloomTest, Schema) {
+  char buffer[sizeof(int)];
+
+  // Adds Key(first) .. Key(limit - 1) and builds the filter from them.
+  auto build_from_range = [&](int first, int limit) {
+    for (int k = first; k < limit; ++k) {
+      Add(Key(k, buffer));
+    }
+    Build();
+  };
+
+  ResetPolicy(8);  // num_probes = 5
+  build_from_range(0, 87);
+  ASSERT_EQ(BloomHash(FilterData()), 3589896109U);
+
+  ResetPolicy(9);  // num_probes = 6
+  build_from_range(0, 87);
+  ASSERT_EQ(BloomHash(FilterData()), 969445585);
+
+  ResetPolicy(11);  // num_probes = 7
+  build_from_range(0, 87);
+  ASSERT_EQ(BloomHash(FilterData()), 1694458207);
+
+  ResetPolicy(10);  // num_probes = 6
+  build_from_range(0, 87);
+  ASSERT_EQ(BloomHash(FilterData()), 2373646410U);
+
+  ResetPolicy(10);
+  build_from_range(/*CHANGED*/ 1, 87);
+  ASSERT_EQ(BloomHash(FilterData()), 1908442116);
+
+  ResetPolicy(10);
+  build_from_range(1, /*CHANGED*/ 88);
+  ASSERT_EQ(BloomHash(FilterData()), 3057004015U);
+
+  // With new fractional bits_per_key, check that we are rounding to
+  // whole bits per key for old Bloom filters.
+  ResetPolicy(9.5);  // Treated as 10
+  build_from_range(1, 88);
+  ASSERT_EQ(BloomHash(FilterData()), /*SAME*/ 3057004015U);
+
+  ResetPolicy(10.499);  // Treated as 10
+  build_from_range(1, 88);
+  ASSERT_EQ(BloomHash(FilterData()), /*SAME*/ 3057004015U);
+
+  ResetPolicy();
+}
+
+// Fixture for filters built with FilterBitsBuilder and read with FilterBitsReader, parameterized over BloomFilterPolicy::Mode.
+
+class FullBloomTest : public testing::TestWithParam<BloomFilterPolicy::Mode> {
+ private:
+ BlockBasedTableOptions table_options_;
+ std::shared_ptr<const FilterPolicy>& policy_;  // alias of table_options_.filter_policy (bound in the constructor)
+ std::unique_ptr<FilterBitsBuilder> bits_builder_;
+ std::unique_ptr<FilterBitsReader> bits_reader_;  // null until Build() or OpenRaw()
+ std::unique_ptr<const char[]> buf_;  // storage handed to bits_builder_->Finish()
+ size_t filter_size_;  // size of the Slice returned by Finish(); 0 after Reset()
+
+ public:
+ FullBloomTest() : policy_(table_options_.filter_policy), filter_size_(0) {
+ ResetPolicy();
+ }
+
+ BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() {
+ // Throws on bad cast
+ return &dynamic_cast<BuiltinFilterBitsBuilder&>(*bits_builder_);
+ }
+
+ const BloomFilterPolicy* GetBloomFilterPolicy() {
+ // Throws on bad cast
+ return &dynamic_cast<const BloomFilterPolicy&>(*policy_);
+ }
+
+ void Reset() {  // new builder from the current table options; drops reader, buffer and size
+ bits_builder_.reset(BloomFilterPolicy::GetBuilderFromContext(
+ FilterBuildingContext(table_options_)));
+ bits_reader_.reset(nullptr);
+ buf_.reset(nullptr);
+ filter_size_ = 0;
+ }
+
+ void ResetPolicy(double bits_per_key) {  // installs a policy for the test's Mode parameter, then Reset()
+ policy_.reset(new BloomFilterPolicy(bits_per_key, GetParam()));
+ Reset();
+ }
+
+ void ResetPolicy() { ResetPolicy(FLAGS_bits_per_key); }
+
+ void Add(const Slice& s) {
+ bits_builder_->AddKey(s);
+ }
+
+ void OpenRaw(const Slice& s) {  // reads `s` as filter data directly, without building
+ bits_reader_.reset(policy_->GetFilterBitsReader(s));
+ }
+
+ void Build() {  // finishes the builder and opens a reader on the resulting filter
+ Slice filter = bits_builder_->Finish(&buf_);
+ bits_reader_.reset(policy_->GetFilterBitsReader(filter));
+ filter_size_ = filter.size();
+ }
+
+ size_t FilterSize() const {
+ return filter_size_;
+ }
+
+ Slice FilterData() { return Slice(buf_.get(), filter_size_); }
+
+ int GetNumProbesFromFilterData() {
+ assert(filter_size_ >= 5);
+ int8_t raw_num_probes = static_cast<int8_t>(buf_.get()[filter_size_ - 5]);  // fifth byte from the end
+ if (raw_num_probes == -1) { // New bloom filter marker
+ return static_cast<uint8_t>(buf_.get()[filter_size_ - 3]);  // third byte from the end
+ } else {
+ return raw_num_probes;
+ }
+ }
+
+ bool Matches(const Slice& s) {
+ if (bits_reader_ == nullptr) {  // no reader yet: build the filter first
+ Build();
+ }
+ return bits_reader_->MayMatch(s);
+ }
+
+ // Provides a kind of fingerprint on the Bloom filter's
+ // behavior, for reasonably high FP rates.
+ uint64_t PackedMatches() {
+ char buffer[sizeof(int)];
+ uint64_t result = 0;
+ for (int i = 0; i < 64; i++) {
+ if (Matches(Key(i + 12345, buffer))) {
+ result |= uint64_t{1} << i;  // bit i records whether Key(i + 12345) matched
+ }
+ }
+ return result;
+ }
+
+ // Provides a kind of fingerprint on the Bloom filter's
+ // behavior, for lower FP rates.
+ std::string FirstFPs(int count) {
+ char buffer[sizeof(int)];
+ std::string rv;
+ int fp_count = 0;
+ for (int i = 0; i < 1000000; i++) {
+ // Append the decimal index of each matching key, comma-separated, until `count` matches are found
+ if (Matches(Key(i + 1000000, buffer))) {
+ ++fp_count;
+ rv += std::to_string(i);
+ if (fp_count == count) {
+ break;
+ }
+ rv += ',';
+ }
+ }
+ return rv;
+ }
+
+ double FalsePositiveRate() {  // fraction of 10000 probe keys reported as possible matches
+ char buffer[sizeof(int)];
+ int result = 0;
+ for (int i = 0; i < 10000; i++) {
+ if (Matches(Key(i + 1000000000, buffer))) {
+ result++;
+ }
+ }
+ return result / 10000.0;
+ }
+
+ uint32_t SelectByImpl(uint32_t for_legacy_bloom,
+ uint32_t for_fast_local_bloom) {  // picks the argument matching the test's Mode parameter
+ switch (GetParam()) {
+ case BloomFilterPolicy::kLegacyBloom:
+ return for_legacy_bloom;
+ case BloomFilterPolicy::kFastLocalBloom:
+ return for_fast_local_bloom;
+ case BloomFilterPolicy::kDeprecatedBlock:
+ case BloomFilterPolicy::kAuto:
+ /* N/A */;
+ }
+ // otherwise
+ assert(false);
+ return 0;
+ }
+};
+
+TEST_P(FullBloomTest, FilterSize) {
+  // In addition to checking the consistency of space computation, we are
+  // checking that denoted and computed doubles are interpreted as expected
+  // as bits_per_key values.
+  //
+  // Each case pairs a requested bits_per_key with the millibits-per-key
+  // value the policy is expected to report.
+  // Note: enforced minimum is 1 bit per key (1000 millibits), and enforced
+  // maximum is 100 bits per key (100000 millibits).
+  const std::vector<std::pair<double, int> > cases{{-HUGE_VAL, 1000},
+                                                   {-INFINITY, 1000},
+                                                   {0.0, 1000},
+                                                   {1.234, 1234},
+                                                   {3.456, 3456},
+                                                   {9.5, 9500},
+                                                   {10.0, 10000},
+                                                   {10.499, 10499},
+                                                   {21.345, 21345},
+                                                   {99.999, 99999},
+                                                   {1234.0, 100000},
+                                                   {HUGE_VAL, 100000},
+                                                   {INFINITY, 100000},
+                                                   {NAN, 100000}};
+  bool some_computed_less_than_denoted = false;
+
+  for (const auto& entry : cases) {
+    const double denoted = entry.first;
+    const int expected_millibits = entry.second;
+    const int expected_whole_bits = (expected_millibits + 500) / 1000;
+
+    ResetPolicy(denoted);
+    const BloomFilterPolicy* bfp = GetBloomFilterPolicy();
+    EXPECT_EQ(expected_millibits, bfp->GetMillibitsPerKey());
+    EXPECT_EQ(expected_whole_bits, bfp->GetWholeBitsPerKey());
+
+    // This transforms e.g. 9.5 -> 9.499999999999998, which we still
+    // round to 10 for whole bits per key.
+    double computed = denoted;
+    computed += 0.5;
+    computed /= 1234567.0;
+    computed *= 1234567.0;
+    computed -= 0.5;
+    some_computed_less_than_denoted |= (computed < denoted);
+
+    ResetPolicy(computed);
+    bfp = GetBloomFilterPolicy();
+    EXPECT_EQ(expected_millibits, bfp->GetMillibitsPerKey());
+    EXPECT_EQ(expected_whole_bits, bfp->GetWholeBitsPerKey());
+
+    // CalculateSpace and CalculateNumEntry must agree with each other.
+    auto bits_builder = GetBuiltinFilterBitsBuilder();
+    for (int n = 1; n < 100; n++) {
+      auto space = bits_builder->CalculateSpace(n);
+      auto n2 = bits_builder->CalculateNumEntry(space);
+      EXPECT_GE(n2, n);
+      auto space2 = bits_builder->CalculateSpace(n2);
+      EXPECT_EQ(space, space2);
+    }
+  }
+  // Check that the compiler hasn't optimized our computation into nothing
+  EXPECT_TRUE(some_computed_less_than_denoted);
+  ResetPolicy();
+}
+
+TEST_P(FullBloomTest, FullEmptyFilter) {
+  // Empty filter is not match, at this level
+  ASSERT_FALSE(Matches("hello"));
+  ASSERT_FALSE(Matches("world"));
+}
+
+// Two added keys must match; two keys that were never added must not.
+TEST_P(FullBloomTest, FullSmall) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_FALSE(Matches("x"));
+  ASSERT_FALSE(Matches("foo"));
+}
+
+TEST_P(FullBloomTest, FullVaryingLengths) {
+  // Scratch buffer handed to Key() for every key.
+  char key_buf[sizeof(int)];
+
+  // Tally of filters by false positive rate: "mediocre" filters exceed
+  // 1.25% (while staying within the hard 2% limit), the rest are "good".
+  int num_mediocre = 0;
+  int num_good = 0;
+
+  int length = 1;
+  while (length <= 10000) {
+    Reset();
+    for (int key = 0; key < length; ++key) {
+      Add(Key(key, key_buf));
+    }
+    Build();
+
+    // Upper bound on the filter size for this many keys.
+    const size_t max_filter_bytes =
+        (size_t)((length * 10 / 8) + CACHE_LINE_SIZE * 2 + 5);
+    ASSERT_LE(FilterSize(), max_filter_bytes);
+
+    // All added keys must match
+    for (int key = 0; key < length; ++key) {
+      ASSERT_TRUE(Matches(Key(key, key_buf)))
+          << "Length " << length << "; key " << key;
+    }
+
+    // Check false positive rate
+    const double fp_rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              fp_rate * 100.0, length, static_cast<int>(FilterSize()));
+    }
+    ASSERT_LE(fp_rate, 0.02);  // Must not be over 2%
+    // Above 1.25% is allowed, but not too often (checked after the loop).
+    ++(fp_rate > 0.0125 ? num_mediocre : num_good);
+
+    length = NextLength(length);
+  }
+
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n", num_good, num_mediocre);
+  }
+  ASSERT_LE(num_mediocre, num_good / 5);
+}
+
+namespace {
+// Returns whichever argument corresponds to the compile-time value of
+// CACHE_LINE_SIZE: for64, for128 or for256. Any other CACHE_LINE_SIZE
+// fails the build via #error.
+inline uint32_t SelectByCacheLineSize(uint32_t for64, uint32_t for128,
+ uint32_t for256) {
+ // Only one parameter is returned in any given configuration; the casts
+ // mark all of them as used (presumably to silence unused-parameter
+ // warnings).
+ (void)for64;
+ (void)for128;
+ (void)for256;
+#if CACHE_LINE_SIZE == 64
+ return for64;
+#elif CACHE_LINE_SIZE == 128
+ return for128;
+#elif CACHE_LINE_SIZE == 256
+ return for256;
+#else
+#error "CACHE_LINE_SIZE unknown or unrecognized"
+#endif
+}
+} // namespace
+
+// Ensure the implementation doesn't accidentally change in an
+// incompatible way. This test doesn't check the reading side
+// (FirstFPs/PackedMatches) for LegacyBloom because it requires the
+// ability to read filters generated using other cache line sizes.
+// See RawSchema.
+//
+// Every stanza below follows the same recipe: reset the policy to a given
+// bits_per_key, add a fixed run of integer keys, build the filter, then
+// compare the probe count and the BloomHash of the filter data against
+// pinned values. FirstFPs is additionally pinned for kFastLocalBloom only.
+// NOTE(review): SelectByImpl is defined outside this excerpt; judging by the
+// "(new)"/"(old)" comments below it appears to take
+// (legacy value, fast-local value) -- confirm.
+TEST_P(FullBloomTest, Schema) {
+ // Scratch buffer handed to Key() for every key.
+ char buffer[sizeof(int)];
+
+ // Use enough keys so that changing bits / key by 1 is guaranteed to
+ // change number of allocated cache lines. So keys > max cache line bits.
+
+ ResetPolicy(2); // num_probes = 1
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 1);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(1567096579, 1964771444, 2659542661U),
+ 3817481309U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("11,13,17,25,29,30,35,37,45,53", FirstFPs(10));
+ }
+
+ ResetPolicy(3); // num_probes = 2
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 2);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(2707206547U, 2571983456U, 218344685),
+ 2807269961U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("4,15,17,24,27,28,29,53,63,70", FirstFPs(10));
+ }
+
+ ResetPolicy(5); // num_probes = 3
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 3);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(515748486, 94611728, 2436112214U),
+ 204628445));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("15,24,29,39,53,87,89,100,103,104", FirstFPs(10));
+ }
+
+ ResetPolicy(8); // num_probes = 5
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 5);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(1302145999, 2811644657U, 756553699),
+ 355564975));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("16,60,66,126,220,238,244,256,265,287", FirstFPs(10));
+ }
+
+ ResetPolicy(9); // num_probes = 6
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 6);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(2092755149, 661139132, 1182970461),
+ 2137566013U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("156,367,791,872,945,1015,1139,1159,1265,1435", FirstFPs(10));
+ }
+
+ ResetPolicy(11); // num_probes = 7
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 7);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(3755609649U, 1812694762, 1449142939),
+ 2561502687U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("34,74,130,236,643,882,962,1015,1035,1110", FirstFPs(10));
+ }
+
+ // This used to be 9 probes, but 8 is a better choice for speed,
+ // especially with SIMD groups of 8 probes, with essentially no
+ // change in FP rate.
+ // FP rate @ 9 probes, old Bloom: 0.4321%
+ // FP rate @ 9 probes, new Bloom: 0.1846%
+ // FP rate @ 8 probes, new Bloom: 0.1843%
+ ResetPolicy(14); // num_probes = 8 (new), 9 (old)
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), SelectByImpl(9, 8));
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(178861123, 379087593, 2574136516U),
+ 3709876890U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("130,240,522,565,989,2002,2526,3147,3543", FirstFPs(9));
+ }
+
+ // This used to be 11 probes, but 9 is a better choice for speed
+ // AND accuracy.
+ // FP rate @ 11 probes, old Bloom: 0.3571%
+ // FP rate @ 11 probes, new Bloom: 0.0884%
+ // FP rate @ 9 probes, new Bloom: 0.0843%
+ ResetPolicy(16); // num_probes = 9 (new), 11 (old)
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), SelectByImpl(11, 9));
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(1129406313, 3049154394U, 1727750964),
+ 1087138490));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("3299,3611,3916,6620,7822,8079,8482,8942,10167", FirstFPs(9));
+ }
+
+ ResetPolicy(10); // num_probes = 6, but different memory ratio vs. 9
+ for (int key = 0; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 6);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(1478976371, 2910591341U, 1182970461),
+ 2498541272U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("16,126,133,422,466,472,813,1002,1035,1159", FirstFPs(10));
+ }
+
+ // The remaining stanzas vary one thing at a time relative to the stanza
+ // before them; /*CHANGED*/ and /*SAME*/ mark what differs and what is
+ // expected to stay equal.
+ ResetPolicy(10);
+ for (int key = /*CHANGED*/ 1; key < 2087; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 6);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(4205696321U, 1132081253U, 2385981855U),
+ 2058382345U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("16,126,133,422,466,472,813,1002,1035,1159", FirstFPs(10));
+ }
+
+ ResetPolicy(10);
+ for (int key = 1; key < /*CHANGED*/ 2088; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 6);
+ EXPECT_EQ(
+ BloomHash(FilterData()),
+ SelectByImpl(SelectByCacheLineSize(2885052954U, 769447944, 4175124908U),
+ 23699164));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ("16,126,133,422,466,472,813,1002,1035,1159", FirstFPs(10));
+ }
+
+ // With new fractional bits_per_key, check that we are rounding to
+ // whole bits per key for old Bloom filters but fractional for
+ // new Bloom filter.
+ ResetPolicy(9.5);
+ for (int key = 1; key < 2088; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), 6);
+ EXPECT_EQ(BloomHash(FilterData()),
+ SelectByImpl(/*SAME*/ SelectByCacheLineSize(2885052954U, 769447944,
+ 4175124908U),
+ /*CHANGED*/ 3166884174U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ(/*CHANGED*/ "126,156,367,444,458,791,813,976,1015,1035",
+ FirstFPs(10));
+ }
+
+ ResetPolicy(10.499);
+ for (int key = 1; key < 2088; key++) {
+ Add(Key(key, buffer));
+ }
+ Build();
+ EXPECT_EQ(GetNumProbesFromFilterData(), SelectByImpl(6, 7));
+ EXPECT_EQ(BloomHash(FilterData()),
+ SelectByImpl(/*SAME*/ SelectByCacheLineSize(2885052954U, 769447944,
+ 4175124908U),
+ /*CHANGED*/ 4098502778U));
+ if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
+ EXPECT_EQ(/*CHANGED*/ "16,236,240,472,1015,1045,1111,1409,1465,1612",
+ FirstFPs(10));
+ }
+
+ ResetPolicy();
+}
+
+// A helper class for testing custom or corrupt filter bits as read by
+// built-in FilterBitsReaders.
+//
+// Every Reset* method returns a Slice over the tail of data_, so the last
+// five bytes of the returned Slice are always the metadata bytes: one byte
+// holding num_probes followed by num_lines as a fixed 32-bit value.
+struct RawFilterTester {
+  // Buffer, from which we always return a tail Slice, so the
+  // last five bytes are always the metadata bytes.
+  std::array<char, 3000> data_;
+  // Points five bytes from the end
+  char* metadata_ptr_;
+
+  RawFilterTester() : metadata_ptr_(&*(data_.end() - 5)) {}
+
+  // Writes the metadata bytes and returns the last
+  // (len_without_metadata + 5) bytes of data_; the bytes in front of the
+  // metadata are left untouched.
+  //
+  // len_without_metadata may be a "negative" value that wrapped around when
+  // converted to uint32_t (e.g. -1 or -5). This yields a Slice that is
+  // shorter than the metadata itself, down to an empty one.
+  Slice ResetNoFill(uint32_t len_without_metadata, uint32_t num_lines,
+                    uint32_t num_probes) {
+    metadata_ptr_[0] = static_cast<char>(num_probes);
+    EncodeFixed32(metadata_ptr_ + 1, num_lines);
+    // Unsigned wrap-around is intended here: -1 + 5 == 4, -5 + 5 == 0.
+    uint32_t len = len_without_metadata + /*metadata*/ 5;
+    assert(len <= data_.size());
+    // Locate the start from the end of the buffer using the total length.
+    // Subtracting len_without_metadata from metadata_ptr_ directly would,
+    // for wrapped values on targets with 64-bit pointers, step roughly 4GB
+    // outside of data_, which is undefined behavior even if the resulting
+    // pointer is never dereferenced.
+    return Slice(data_.data() + (data_.size() - len), len);
+  }
+
+  // Fills the whole buffer with all-one bits (fill_ones) or all-zero bits,
+  // then behaves like ResetNoFill.
+  Slice Reset(uint32_t len_without_metadata, uint32_t num_lines,
+              uint32_t num_probes, bool fill_ones) {
+    data_.fill(fill_ones ? 0xff : 0);
+    return ResetNoFill(len_without_metadata, num_lines, num_probes);
+  }
+
+  // Fills the whole buffer with a fixed, position-dependent pattern (the
+  // low byte of 0x7b7b shifted right by i % 7), then behaves like
+  // ResetNoFill.
+  Slice ResetWeirdFill(uint32_t len_without_metadata, uint32_t num_lines,
+                       uint32_t num_probes) {
+    for (uint32_t i = 0; i < data_.size(); ++i) {
+      data_[i] = static_cast<char>(0x7b7b >> (i % 7));
+    }
+    return ResetNoFill(len_without_metadata, num_lines, num_probes);
+  }
+};
+
+TEST_P(FullBloomTest, RawSchema) {
+  RawFilterTester tester;
+  // Every case uses the same 256 bytes of "weird fill" with two probes,
+  // about 3/4 bits set: ~50% "FP" rate. Only the number of cache lines
+  // recorded in the metadata changes, and each layout has its own pinned
+  // PackedMatches() value.
+
+  // One 256-byte cache line.
+  const Slice one_line = tester.ResetWeirdFill(256, 1, 2);
+  OpenRaw(one_line);
+  EXPECT_EQ(uint64_t{11384799501900898790U}, PackedMatches());
+
+  // Two 128-byte cache lines.
+  const Slice two_lines = tester.ResetWeirdFill(256, 2, 2);
+  OpenRaw(two_lines);
+  EXPECT_EQ(uint64_t{10157853359773492589U}, PackedMatches());
+
+  // Four 64-byte cache lines.
+  const Slice four_lines = tester.ResetWeirdFill(256, 4, 2);
+  OpenRaw(four_lines);
+  EXPECT_EQ(uint64_t{7123594913907464682U}, PackedMatches());
+}
+
+// Feeds hand-crafted filter contents (valid, trivial, malformed and
+// dubious) to the reader and pins what Matches() returns for each. Every
+// case runs twice: once with an all-zero buffer and once with an all-ones
+// buffer. The arguments to Reset() are
+// (len_without_metadata, num_lines, num_probes, fill_ones).
+TEST_P(FullBloomTest, CorruptFilters) {
+ RawFilterTester cft;
+
+ for (bool fill : {false, true}) {
+ // Good filter bits - returns same as fill
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 6, fill));
+ ASSERT_EQ(fill, Matches("hello"));
+ ASSERT_EQ(fill, Matches("world"));
+
+ // Good filter bits - returns same as fill
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE * 3, 3, 6, fill));
+ ASSERT_EQ(fill, Matches("hello"));
+ ASSERT_EQ(fill, Matches("world"));
+
+ // Good filter bits - returns same as fill
+ // 256 is unusual but legal cache line size
+ OpenRaw(cft.Reset(256 * 3, 3, 6, fill));
+ ASSERT_EQ(fill, Matches("hello"));
+ ASSERT_EQ(fill, Matches("world"));
+
+ // Good filter bits - returns same as fill
+ // 30 should be max num_probes
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 30, fill));
+ ASSERT_EQ(fill, Matches("hello"));
+ ASSERT_EQ(fill, Matches("world"));
+
+ // Good filter bits - returns same as fill
+ // 1 should be min num_probes
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 1, fill));
+ ASSERT_EQ(fill, Matches("hello"));
+ ASSERT_EQ(fill, Matches("world"));
+
+ // Type 1 trivial filter bits - returns true as if FP by zero probes
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 0, fill));
+ ASSERT_TRUE(Matches("hello"));
+ ASSERT_TRUE(Matches("world"));
+
+ // Type 2 trivial filter bits - returns false as if built from zero keys
+ OpenRaw(cft.Reset(0, 0, 6, fill));
+ ASSERT_FALSE(Matches("hello"));
+ ASSERT_FALSE(Matches("world"));
+
+ // Type 2 trivial filter bits - returns false as if built from zero keys
+ OpenRaw(cft.Reset(0, 37, 6, fill));
+ ASSERT_FALSE(Matches("hello"));
+ ASSERT_FALSE(Matches("world"));
+
+ // Type 2 trivial filter bits - returns false as 0 size trumps 0 probes
+ OpenRaw(cft.Reset(0, 0, 0, fill));
+ ASSERT_FALSE(Matches("hello"));
+ ASSERT_FALSE(Matches("world"));
+
+ // Bad filter bits - returns true for safety
+ // No solution to 0 * x == CACHE_LINE_SIZE
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 0, 6, fill));
+ ASSERT_TRUE(Matches("hello"));
+ ASSERT_TRUE(Matches("world"));
+
+ // Bad filter bits - returns true for safety
+ // Can't have 3 * x == 4 for integer x
+ OpenRaw(cft.Reset(4, 3, 6, fill));
+ ASSERT_TRUE(Matches("hello"));
+ ASSERT_TRUE(Matches("world"));
+
+ // Bad filter bits - returns true for safety
+ // 97 bytes is not a power of two, so not a legal cache line size
+ OpenRaw(cft.Reset(97 * 3, 3, 6, fill));
+ ASSERT_TRUE(Matches("hello"));
+ ASSERT_TRUE(Matches("world"));
+
+ // Bad filter bits - returns true for safety
+ // 65 bytes is not a power of two, so not a legal cache line size
+ OpenRaw(cft.Reset(65 * 3, 3, 6, fill));
+ ASSERT_TRUE(Matches("hello"));
+ ASSERT_TRUE(Matches("world"));
+
+ // Bad filter bits - returns false as if built from zero keys
+ // < 5 bytes overall means missing even metadata
+ // NOTE: Reset() takes uint32_t, so -1 wraps around; adding the 5 metadata
+ // bytes wraps again, giving an overall Slice length of 4.
+ OpenRaw(cft.Reset(-1, 3, 6, fill));
+ ASSERT_FALSE(Matches("hello"));
+ ASSERT_FALSE(Matches("world"));
+
+ // Likewise, -5 gives an overall Slice length of 0.
+ OpenRaw(cft.Reset(-5, 3, 6, fill));
+ ASSERT_FALSE(Matches("hello"));
+ ASSERT_FALSE(Matches("world"));
+
+ // Dubious filter bits - returns same as fill (for now)
+ // 31 is not a useful num_probes, nor generated by RocksDB unless directly
+ // using filter bits API without BloomFilterPolicy.
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 31, fill));
+ ASSERT_EQ(fill, Matches("hello"));
+ ASSERT_EQ(fill, Matches("world"));
+
+ // Dubious filter bits - returns same as fill (for now)
+ // Similar, with 127, largest positive char
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 127, fill));
+ ASSERT_EQ(fill, Matches("hello"));
+ ASSERT_EQ(fill, Matches("world"));
+
+ // Dubious filter bits - returns true (for now)
+ // num_probes set to 128 / -128, lowest negative char
+ // NB: Bug in implementation interprets this as negative and has same
+ // effect as zero probes, but effectively reserves negative char values
+ // for future use.
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 128, fill));
+ ASSERT_TRUE(Matches("hello"));
+ ASSERT_TRUE(Matches("world"));
+
+ // Dubious filter bits - returns true (for now)
+ // Similar, with 255 / -1
+ OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 255, fill));
+ ASSERT_TRUE(Matches("hello"));
+ ASSERT_TRUE(Matches("world"));
+ }
+}
+
+// Instantiates the FullBloomTest cases under the "Full" prefix, once for
+// each listed implementation; tests read the active one via GetParam().
+INSTANTIATE_TEST_CASE_P(Full, FullBloomTest,
+ testing::Values(BloomFilterPolicy::kLegacyBloom,
+ BloomFilterPolicy::kFastLocalBloom));
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test entry point: googletest sees the command line first, then
+// ParseCommandLineFlags processes it, and finally all tests are run.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  // The result of RUN_ALL_TESTS() becomes the process exit status.
+  const int exit_status = RUN_ALL_TESTS();
+  return exit_status;
+}
+
+#endif // GFLAGS
diff --git a/src/rocksdb/util/build_version.cc.in b/src/rocksdb/util/build_version.cc.in
new file mode 100644
index 000000000..856958018
--- /dev/null
+++ b/src/rocksdb/util/build_version.cc.in
@@ -0,0 +1,5 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include "build_version.h"
+// Build identification strings.
+// NOTE(review): the @@GIT_SHA@@ and @@GIT_DATE_TIME@@ tokens look like
+// placeholders that are filled in when this .in template is turned into
+// build_version.cc -- confirm against the build scripts.
+const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:@@GIT_SHA@@";
+const char* rocksdb_build_git_date = "rocksdb_build_git_date:@@GIT_DATE_TIME@@";
+// Date on which this translation unit was compiled (standard __DATE__).
+const char* rocksdb_build_compile_date = __DATE__;
diff --git a/src/rocksdb/util/build_version.h b/src/rocksdb/util/build_version.h
new file mode 100644
index 000000000..36ff92c07
--- /dev/null
+++ b/src/rocksdb/util/build_version.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#if !defined(IOS_CROSS_COMPILE)
+// if we compile with Xcode, we don't run build_detect_version, so we don't
+// generate these variables
+// this variable tells us about the git revision
+extern const char* rocksdb_build_git_sha;
+
+// Date/time string for the git revision. It is defined alongside the other
+// two variables in build_version.cc.in (from the GIT_DATE_TIME placeholder),
+// so it is declared here as well instead of leaving every user to write its
+// own extern declaration.
+extern const char* rocksdb_build_git_date;
+
+// Date on which the code was compiled:
+extern const char* rocksdb_build_compile_date;
+#endif
diff --git a/src/rocksdb/util/cast_util.h b/src/rocksdb/util/cast_util.h
new file mode 100644
index 000000000..d7d9a9e9c
--- /dev/null
+++ b/src/rocksdb/util/cast_util.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace ROCKSDB_NAMESPACE {
+// Helper for legacy code that moved from dynamic_cast<> to static_cast<>:
+// performs the static_cast<> and, when built with ROCKSDB_USE_RTTI, asserts
+// that dynamic_cast<> yields the very same pointer.
+// It is not recommended to add new code that relies on class casting. The
+// preferred solution is to implement the functionality without a need of
+// casting.
+template <class DestClass, class SrcClass>
+inline DestClass* static_cast_with_check(SrcClass* x) {
+#ifdef ROCKSDB_USE_RTTI
+  assert(static_cast<DestClass*>(x) == dynamic_cast<DestClass*>(x));
+#endif
+  return static_cast<DestClass*>(x);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/channel.h b/src/rocksdb/util/channel.h
new file mode 100644
index 000000000..705fa2d28
--- /dev/null
+++ b/src/rocksdb/util/channel.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+namespace ROCKSDB_NAMESPACE {
+
+// An unbounded FIFO queue of T guarded by a mutex, with an end-of-stream
+// flag. Every member function takes the internal lock; read() blocks on a
+// condition variable until an element or EOF is available.
+template <class T>
+class channel {
+ public:
+  explicit channel() : eof_(false) {}
+
+  channel(const channel&) = delete;
+  void operator=(const channel&) = delete;
+
+  // Marks the channel as finished and wakes up every blocked reader.
+  // Elements that are already queued can still be read afterwards.
+  void sendEof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    eof_ = true;
+    cv_.notify_all();
+  }
+
+  // Returns true once sendEof() has been called and the queue is drained.
+  // Pure observer, hence const like size(); lock_ is mutable for that.
+  bool eof() const {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.empty() && eof_;
+  }
+
+  // Returns the number of elements currently queued.
+  size_t size() const {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.size();
+  }
+
+  // writes elem to the queue
+  // elem is moved into the queue and one waiting reader is woken up.
+  void write(T&& elem) {
+    std::lock_guard<std::mutex> lk(lock_);
+    // T&& is a plain rvalue reference here (T belongs to the class, not to
+    // this function), so std::move states the intent directly.
+    buffer_.emplace(std::move(elem));
+    cv_.notify_one();
+  }
+
+  /// Moves a dequeued element onto elem, blocking until an element
+  /// is available.
+  // returns false if EOF
+  bool read(T& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    cv_.wait(lk, [&] { return eof_ || !buffer_.empty(); });
+    if (eof_ && buffer_.empty()) {
+      return false;
+    }
+    elem = std::move(buffer_.front());
+    buffer_.pop();
+    // NOTE(review): presumably hands over to another blocked reader in case
+    // more elements are queued -- confirm before removing.
+    cv_.notify_one();
+    return true;
+  }
+
+ private:
+  std::condition_variable cv_;
+  mutable std::mutex lock_;
+  std::queue<T> buffer_;
+  bool eof_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coding.cc b/src/rocksdb/util/coding.cc
new file mode 100644
index 000000000..a54324d28
--- /dev/null
+++ b/src/rocksdb/util/coding.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include <algorithm>
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// MSVC warning C4244 ('conversion' conversion from 'type1' to 'type2',
+// possible loss of data) is disabled around this function: every store
+// below intentionally keeps only the low 8 bits of its operand.
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4244)
+#endif
+// Writes v to dst as a varint32: 7 payload bits per byte, least significant
+// group first, with the high bit set on every byte except the last one.
+// Between 1 and 5 bytes are written. Returns a pointer just past the last
+// byte written.
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* out = reinterpret_cast<unsigned char*>(dst);
+  const uint32_t kMore = 128;  // continuation bit
+  if (v < (1u << 7)) {
+    out[0] = v;
+    return dst + 1;
+  }
+  if (v < (1u << 14)) {
+    out[0] = v | kMore;
+    out[1] = v >> 7;
+    return dst + 2;
+  }
+  if (v < (1u << 21)) {
+    out[0] = v | kMore;
+    out[1] = (v >> 7) | kMore;
+    out[2] = v >> 14;
+    return dst + 3;
+  }
+  if (v < (1u << 28)) {
+    out[0] = v | kMore;
+    out[1] = (v >> 7) | kMore;
+    out[2] = (v >> 14) | kMore;
+    out[3] = v >> 21;
+    return dst + 4;
+  }
+  out[0] = v | kMore;
+  out[1] = (v >> 7) | kMore;
+  out[2] = (v >> 14) | kMore;
+  out[3] = (v >> 21) | kMore;
+  out[4] = v >> 28;
+  return dst + 5;
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+// Decodes a varint32 from [p, limit). On success stores the result in
+// *value and returns a pointer just past the last byte consumed. Returns
+// nullptr, leaving *value untouched, when the input ends before a byte
+// without the continuation bit is found or when five bytes have been read
+// without finding one.
+const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                   uint32_t* value) {
+  uint32_t acc = 0;
+  uint32_t shift = 0;
+  while (shift <= 28 && p < limit) {
+    const uint32_t b = static_cast<unsigned char>(*p);
+    ++p;
+    // The low seven bits always carry payload.
+    acc |= (b & 127) << shift;
+    if ((b & 128) == 0) {
+      // Final byte: no continuation bit.
+      *value = acc;
+      return p;
+    }
+    shift += 7;
+  }
+  return nullptr;
+}
+
+// Decodes a varint64 from [p, limit). On success stores the result in
+// *value and returns a pointer just past the last byte consumed. Returns
+// nullptr, leaving *value untouched, when the input ends before a byte
+// without the continuation bit is found or when ten bytes have been read
+// without finding one.
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t acc = 0;
+  uint32_t shift = 0;
+  while (shift <= 63 && p < limit) {
+    const uint64_t b = static_cast<unsigned char>(*p);
+    ++p;
+    // The low seven bits always carry payload.
+    acc |= (b & 127) << shift;
+    if ((b & 128) == 0) {
+      // Final byte: no continuation bit.
+      *value = acc;
+      return p;
+    }
+    shift += 7;
+  }
+  return nullptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coding.h b/src/rocksdb/util/coding.h
new file mode 100644
index 000000000..d46654ec4
--- /dev/null
+++ b/src/rocksdb/util/coding.h
@@ -0,0 +1,480 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Endian-neutral encoding:
+// * Fixed-length numbers are encoded with least-significant byte first
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+
+#pragma once
+#include <algorithm>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+
+#include "rocksdb/write_batch.h"
+#include "port/port.h"
+
+// Some processors do not allow unaligned access to memory
+#if defined(__sparc)
+ #define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// The maximum length of a varint in bytes for 64-bit.
+const unsigned int kMaxVarint64Length = 10;
+
+// Standard Put... routines append to a string
+extern void PutFixed16(std::string* dst, uint16_t value);
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint32Varint32(std::string* dst, uint32_t value1,
+ uint32_t value2);
+extern void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1,
+ uint32_t value2, uint32_t value3);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutVarint64Varint64(std::string* dst, uint64_t value1,
+ uint64_t value2);
+extern void PutVarint32Varint64(std::string* dst, uint32_t value1,
+ uint64_t value2);
+extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1,
+ uint32_t value2, uint64_t value3);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+extern void PutLengthPrefixedSliceParts(std::string* dst,
+ const SliceParts& slice_parts);
+extern void PutLengthPrefixedSlicePartsWithPadding(
+ std::string* dst, const SliceParts& slice_parts, size_t pad_sz);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetFixed64(Slice* input, uint64_t* value);
+extern bool GetFixed32(Slice* input, uint32_t* value);
+extern bool GetFixed16(Slice* input, uint16_t* value);
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetVarsignedint64(Slice* input, int64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+// This function assumes data is well-formed.
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+extern Slice GetSliceUntil(Slice* slice, char delimiter);
+
+// Zigzag-encodes a signed value so that values of small magnitude map to
+// small unsigned values: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
+// Borrowed from
+// https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208
+constexpr inline uint64_t i64ToZigzag(const int64_t l) {
+  // Negative inputs have every bit of the doubled value flipped, which is
+  // the same as XOR-ing with the sign bit smeared over all 64 bits.
+  return l < 0 ? ~(static_cast<uint64_t>(l) << 1)
+               : (static_cast<uint64_t>(l) << 1);
+}
+// Inverse of the zigzag encoding: the lowest bit of n selects the sign and
+// the remaining bits carry the magnitude (0 -> 0, 1 -> -1, 2 -> 1, ...).
+inline int64_t zigzagToI64(uint64_t n) {
+  const uint64_t magnitude_bits = n >> 1;
+  // All ones when the sign bit is set, all zeros otherwise.
+  const uint64_t sign_mask = (n & 1) ? ~uint64_t{0} : uint64_t{0};
+  return static_cast<int64_t>(magnitude_bits ^ sign_mask);
+}
+
+// Pointer-based variants of GetVarint... These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// nullptr on error. These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
+// Signed counterpart of GetVarint64Ptr: reads a varint64 from [p, limit)
+// and stores its zigzag-decoded value in *value. Returns whatever
+// GetVarint64Ptr returns (nullptr on error).
+inline const char* GetVarsignedint64Ptr(const char* p, const char* limit,
+                                        int64_t* value) {
+  uint64_t zigzag = 0;
+  const char* const next = GetVarint64Ptr(p, limit, &zigzag);
+  // Note that *value is assigned unconditionally, i.e. also when next is
+  // nullptr; zigzag still holds its initial 0 in that case.
+  *value = zigzagToI64(zigzag);
+  return next;
+}
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+extern void EncodeFixed16(char* dst, uint16_t value);
+extern void EncodeFixed32(char* dst, uint32_t value);
+extern void EncodeFixed64(char* dst, uint64_t value);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+// Reads a 16-bit value stored least-significant byte first at ptr.
+// No bounds checking: ptr must reference at least 2 readable bytes.
+inline uint16_t DecodeFixed16(const char* ptr) {
+  if (!port::kLittleEndian) {
+    // Assemble the value byte by byte.
+    const unsigned char lo = static_cast<unsigned char>(ptr[0]);
+    const unsigned char hi = static_cast<unsigned char>(ptr[1]);
+    return static_cast<uint16_t>(lo | (hi << 8));
+  }
+  // Host byte order matches the encoding: load the raw bytes.
+  uint16_t raw;
+  memcpy(&raw, ptr, sizeof(raw));  // gcc optimizes this to a plain load
+  return raw;
+}
+
+// Reads a 32-bit value stored least-significant byte first at ptr.
+// No bounds checking: ptr must reference at least 4 readable bytes.
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (!port::kLittleEndian) {
+    // Assemble the value byte by byte, most significant byte first.
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(ptr);
+    uint32_t assembled = 0;
+    for (int i = 3; i >= 0; --i) {
+      assembled = (assembled << 8) | bytes[i];
+    }
+    return assembled;
+  }
+  // Host byte order matches the encoding: load the raw bytes.
+  uint32_t raw;
+  memcpy(&raw, ptr, sizeof(raw));  // gcc optimizes this to a plain load
+  return raw;
+}
+
+// Reads a 64-bit value stored least-significant byte first at ptr.
+// No bounds checking: ptr must reference at least 8 readable bytes.
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (!port::kLittleEndian) {
+    // Combine two 32-bit halves; the low half comes first in memory.
+    const uint64_t low_half = DecodeFixed32(ptr);
+    const uint64_t high_half = DecodeFixed32(ptr + 4);
+    return low_half | (high_half << 32);
+  }
+  // Host byte order matches the encoding: load the raw bytes.
+  uint64_t raw;
+  memcpy(&raw, ptr, sizeof(raw));  // gcc optimizes this to a plain load
+  return raw;
+}
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p,
+ const char* limit,
+ uint32_t* value);
+// Decodes a varint32 from [p, limit). Single-byte encodings are handled
+// inline; everything else (including empty input) is delegated to
+// GetVarint32PtrFallback. Returns a pointer just past the parsed value, or
+// whatever the fallback returns.
+inline const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32_t* value) {
+  if (p < limit) {
+    const unsigned char first = static_cast<unsigned char>(*p);
+    if (first < 128) {
+      // No continuation bit: the byte is the value.
+      *value = first;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
+// -- Implementation of the functions declared above
+// Stores value at buf as 2 bytes, least-significant byte first.
+// REQUIRES: buf has room for 2 bytes.
+inline void EncodeFixed16(char* buf, uint16_t value) {
+  if (!port::kLittleEndian) {
+    // Emit the bytes one by one, lowest first.
+    for (size_t i = 0; i < sizeof(value); ++i) {
+      buf[i] = (value >> (8 * i)) & 0xff;
+    }
+    return;
+  }
+  // Host byte order matches the encoding: copy the raw bytes.
+  memcpy(buf, &value, sizeof(value));
+}
+
+// Stores value at buf as 4 bytes, least-significant byte first.
+// REQUIRES: buf has room for 4 bytes.
+inline void EncodeFixed32(char* buf, uint32_t value) {
+  if (!port::kLittleEndian) {
+    // Emit the bytes one by one, lowest first.
+    for (size_t i = 0; i < sizeof(value); ++i) {
+      buf[i] = (value >> (8 * i)) & 0xff;
+    }
+    return;
+  }
+  // Host byte order matches the encoding: copy the raw bytes.
+  memcpy(buf, &value, sizeof(value));
+}
+
+// Stores value at buf as 8 bytes, least-significant byte first.
+// REQUIRES: buf has room for 8 bytes.
+inline void EncodeFixed64(char* buf, uint64_t value) {
+  if (!port::kLittleEndian) {
+    // Emit the bytes one by one, lowest first.
+    for (size_t i = 0; i < sizeof(value); ++i) {
+      buf[i] = (value >> (8 * i)) & 0xff;
+    }
+    return;
+  }
+  // Host byte order matches the encoding: copy the raw bytes.
+  memcpy(buf, &value, sizeof(value));
+}
+
+// Appends value to *dst as 2 bytes, least-significant byte first.
+inline void PutFixed16(std::string* dst, uint16_t value) {
+  // EncodeFixed16 already copies the raw bytes on little-endian hosts, so a
+  // single path produces the same bytes on every host.
+  char encoded[sizeof(value)];
+  EncodeFixed16(encoded, value);
+  dst->append(encoded, sizeof(encoded));
+}
+
+// Appends value to *dst as 4 bytes, least-significant byte first.
+inline void PutFixed32(std::string* dst, uint32_t value) {
+  // EncodeFixed32 already copies the raw bytes on little-endian hosts, so a
+  // single path produces the same bytes on every host.
+  char encoded[sizeof(value)];
+  EncodeFixed32(encoded, value);
+  dst->append(encoded, sizeof(encoded));
+}
+
+// Appends value to *dst as 8 bytes, least-significant byte first.
+inline void PutFixed64(std::string* dst, uint64_t value) {
+  // EncodeFixed64 already copies the raw bytes on little-endian hosts, so a
+  // single path produces the same bytes on every host.
+  char encoded[sizeof(value)];
+  EncodeFixed64(encoded, value);
+  dst->append(encoded, sizeof(encoded));
+}
+
+// Appends v to *dst in varint32 encoding.
+inline void PutVarint32(std::string* dst, uint32_t v) {
+  char scratch[5];  // a varint32 occupies at most 5 bytes
+  const char* const end = EncodeVarint32(scratch, v);
+  const size_t written = static_cast<size_t>(end - scratch);
+  dst->append(scratch, written);
+}
+
+// Appends v1 followed by v2 to *dst, each in varint32 encoding.
+inline void PutVarint32Varint32(std::string* dst, uint32_t v1, uint32_t v2) {
+  char scratch[10];  // two varint32 values, at most 5 bytes each
+  char* cursor = scratch;
+  cursor = EncodeVarint32(cursor, v1);
+  cursor = EncodeVarint32(cursor, v2);
+  const size_t written = static_cast<size_t>(cursor - scratch);
+  dst->append(scratch, written);
+}
+
+// Appends v1, v2 and v3 to *dst in that order, each in varint32 encoding.
+inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1,
+                                        uint32_t v2, uint32_t v3) {
+  char scratch[15];  // three varint32 values, at most 5 bytes each
+  char* cursor = scratch;
+  cursor = EncodeVarint32(cursor, v1);
+  cursor = EncodeVarint32(cursor, v2);
+  cursor = EncodeVarint32(cursor, v3);
+  const size_t written = static_cast<size_t>(cursor - scratch);
+  dst->append(scratch, written);
+}
+
+// Writes v to dst as a varint64: 7 payload bits per byte, least significant
+// group first, with the high bit set on every byte except the last one.
+// Between 1 and 10 bytes are written. Returns a pointer just past the last
+// byte written.
+inline char* EncodeVarint64(char* dst, uint64_t v) {
+  unsigned char* out = reinterpret_cast<unsigned char*>(dst);
+  // Emit full 7-bit groups while more than 7 bits remain.
+  for (; v >= 128; v >>= 7) {
+    *out++ = static_cast<unsigned char>((v & 127) | 128);
+  }
+  // Final group, continuation bit clear.
+  *out++ = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(out);
+}
+
+// Appends v to *dst in varint64 encoding.
+inline void PutVarint64(std::string* dst, uint64_t v) {
+  char scratch[kMaxVarint64Length];
+  const char* const end = EncodeVarint64(scratch, v);
+  const size_t written = static_cast<size_t>(end - scratch);
+  dst->append(scratch, written);
+}
+
+// Appends the signed value v to *dst: v is first mapped to an unsigned
+// value using the Zigzag format, which is then written in varint64
+// encoding.
+inline void PutVarsignedint64(std::string* dst, int64_t v) {
+  const uint64_t zigzag = i64ToZigzag(v);
+  char scratch[kMaxVarint64Length];
+  const char* const end = EncodeVarint64(scratch, zigzag);
+  const size_t written = static_cast<size_t>(end - scratch);
+  dst->append(scratch, written);
+}
+
+// Appends v1 followed by v2 to *dst, each in varint64 encoding.
+inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) {
+  char scratch[20];  // two varint64 values, at most 10 bytes each
+  char* cursor = scratch;
+  cursor = EncodeVarint64(cursor, v1);
+  cursor = EncodeVarint64(cursor, v2);
+  const size_t written = static_cast<size_t>(cursor - scratch);
+  dst->append(scratch, written);
+}
+
+// Appends v1 in varint32 encoding followed by v2 in varint64 encoding.
+inline void PutVarint32Varint64(std::string* dst, uint32_t v1, uint64_t v2) {
+  char scratch[15];  // at most 5 bytes for v1 plus 10 bytes for v2
+  char* cursor = scratch;
+  cursor = EncodeVarint32(cursor, v1);
+  cursor = EncodeVarint64(cursor, v2);
+  const size_t written = static_cast<size_t>(cursor - scratch);
+  dst->append(scratch, written);
+}
+
+// Appends varint32(`v1`), varint32(`v2`) and varint64(`v3`), in that order,
+// to `*dst` with a single append call.
+inline void PutVarint32Varint32Varint64(std::string* dst, uint32_t v1,
+                                        uint32_t v2, uint64_t v3) {
+  char scratch[20];
+  char* cursor = scratch;
+  cursor = EncodeVarint32(cursor, v1);
+  cursor = EncodeVarint32(cursor, v2);
+  cursor = EncodeVarint64(cursor, v3);
+  dst->append(scratch, static_cast<size_t>(cursor - scratch));
+}
+
+// Appends `value` to `*dst` as a varint32 length prefix followed by the
+// slice's bytes. The length is narrowed to 32 bits before it is encoded.
+inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  const size_t payload_len = value.size();
+  PutVarint32(dst, static_cast<uint32_t>(payload_len));
+  dst->append(value.data(), payload_len);
+}
+
+// Appends to `*dst` a varint32 length prefix followed by the bytes of every
+// part in `slice_parts`. The prefix value is the incoming `total_bytes` plus
+// the sum of the part sizes, narrowed to 32 bits.
+inline void PutLengthPrefixedSliceParts(std::string* dst, size_t total_bytes,
+                                        const SliceParts& slice_parts) {
+  const int part_count = slice_parts.num_parts;
+  size_t prefix_value = total_bytes;
+  for (int idx = 0; idx < part_count; ++idx) {
+    prefix_value += slice_parts.parts[idx].size();
+  }
+  PutVarint32(dst, static_cast<uint32_t>(prefix_value));
+  for (int idx = 0; idx < part_count; ++idx) {
+    const Slice& part = slice_parts.parts[idx];
+    dst->append(part.data(), part.size());
+  }
+}
+
+// Convenience overload: length-prefixes the concatenation of the parts with
+// no extra bytes counted in the prefix.
+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  const size_t no_extra_bytes = 0;
+  PutLengthPrefixedSliceParts(dst, no_extra_bytes, slice_parts);
+}
+
+// Like PutLengthPrefixedSliceParts(), but additionally appends `pad_sz` zero
+// bytes after the parts. The padding is included in the length prefix.
+inline void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz) {
+  PutLengthPrefixedSliceParts(dst, /*total_bytes=*/pad_sz, slice_parts);
+  // Growing the string with a fill character appends `pad_sz` NUL bytes.
+  dst->resize(dst->size() + pad_sz, '\0');
+}
+
+// Returns the number of bytes the varint encoding of `v` occupies: one byte,
+// plus one more for every additional group of 7 bits that is still non-zero.
+inline int VarintLength(uint64_t v) {
+  int num_bytes = 1;
+  for (uint64_t rest = v >> 7; rest != 0; rest >>= 7) {
+    ++num_bytes;
+  }
+  return num_bytes;
+}
+
+// Decodes a fixed-width 64-bit value from the front of `*input` into
+// `*value` and drops those bytes from `*input`. Returns false, leaving both
+// untouched, when fewer than sizeof(uint64_t) bytes are available.
+inline bool GetFixed64(Slice* input, uint64_t* value) {
+  const size_t width = sizeof(uint64_t);
+  const bool has_enough = input->size() >= width;
+  if (has_enough) {
+    *value = DecodeFixed64(input->data());
+    input->remove_prefix(width);
+  }
+  return has_enough;
+}
+
+// Decodes a fixed-width 32-bit value from the front of `*input` into
+// `*value` and drops those bytes from `*input`. Returns false, leaving both
+// untouched, when fewer than sizeof(uint32_t) bytes are available.
+inline bool GetFixed32(Slice* input, uint32_t* value) {
+  const size_t width = sizeof(uint32_t);
+  const bool has_enough = input->size() >= width;
+  if (has_enough) {
+    *value = DecodeFixed32(input->data());
+    input->remove_prefix(width);
+  }
+  return has_enough;
+}
+
+// Decodes a fixed-width 16-bit value from the front of `*input` into
+// `*value` and drops those bytes from `*input`. Returns false, leaving both
+// untouched, when fewer than sizeof(uint16_t) bytes are available.
+inline bool GetFixed16(Slice* input, uint16_t* value) {
+  const size_t width = sizeof(uint16_t);
+  const bool has_enough = input->size() >= width;
+  if (has_enough) {
+    *value = DecodeFixed16(input->data());
+    input->remove_prefix(width);
+  }
+  return has_enough;
+}
+
+// Parses a varint32 from the front of `*input` into `*value`. On success
+// `*input` is advanced past the parsed bytes and true is returned; when
+// GetVarint32Ptr() reports failure, `*input` is left as it was.
+inline bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* const begin = input->data();
+  const char* const end = begin + input->size();
+  const char* const rest = GetVarint32Ptr(begin, end, value);
+  if (rest == nullptr) {
+    return false;
+  }
+  *input = Slice(rest, static_cast<size_t>(end - rest));
+  return true;
+}
+
+// Parses a varint64 from the front of `*input` into `*value`. On success
+// `*input` is advanced past the parsed bytes and true is returned; when
+// GetVarint64Ptr() reports failure, `*input` is left as it was.
+inline bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* const begin = input->data();
+  const char* const end = begin + input->size();
+  const char* const rest = GetVarint64Ptr(begin, end, value);
+  if (rest == nullptr) {
+    return false;
+  }
+  *input = Slice(rest, static_cast<size_t>(end - rest));
+  return true;
+}
+
+// Parses a signed varint64 from the front of `*input` into `*value`. On
+// success `*input` is advanced past the parsed bytes and true is returned;
+// when GetVarsignedint64Ptr() reports failure, `*input` is left as it was.
+inline bool GetVarsignedint64(Slice* input, int64_t* value) {
+  const char* const begin = input->data();
+  const char* const end = begin + input->size();
+  const char* const rest = GetVarsignedint64Ptr(begin, end, value);
+  if (rest == nullptr) {
+    return false;
+  }
+  *input = Slice(rest, static_cast<size_t>(end - rest));
+  return true;
+}
+
+// Provide an interface for platform independent endianness transformation.
+// The first `size` bytes of `input`, in memory order, are placed into the
+// result from the most significant of the `size` low-order byte positions
+// downwards, i.e. their order is reversed.
+inline uint64_t EndianTransform(uint64_t input, size_t size) {
+  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(&input);
+  uint64_t reversed = 0;
+  for (size_t i = 0; i < size; ++i) {
+    const size_t shift_bits = (size - i - 1) * 8;
+    reversed |= static_cast<uint64_t>(bytes[i]) << shift_bits;
+  }
+  return reversed;
+}
+
+// Reads a varint32 length followed by that many bytes from the front of
+// `*input`. On success `*result` refers to the payload bytes and `*input` is
+// advanced past them. Returns false when the length cannot be parsed or the
+// remaining input is shorter than the announced length.
+inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t payload_len = 0;
+  if (!GetVarint32(input, &payload_len) || input->size() < payload_len) {
+    return false;
+  }
+  *result = Slice(input->data(), payload_len);
+  input->remove_prefix(payload_len);
+  return true;
+}
+
+// Returns the payload of a length-prefixed record that starts at `data`.
+inline Slice GetLengthPrefixedSlice(const char* data) {
+  // "data" is assumed not to be corrupted, so the prefix is parsed with a
+  // limit of data + 5: each varint byte carries 7 bits of the value, hence a
+  // 32-bit length needs at most 5 bytes.
+  uint32_t payload_len = 0;
+  const char* payload =
+      GetVarint32Ptr(data, data + 5 /* limit */, &payload_len);
+  return Slice(payload, payload_len);
+}
+
+// Splits off the leading part of `*slice` up to, but not including, the
+// first occurrence of `delimiter` and returns it. `*slice` is advanced past
+// the returned bytes and past the delimiter itself when one was found. When
+// no delimiter is present the whole slice is returned and `*slice` becomes
+// empty.
+inline Slice GetSliceUntil(Slice* slice, char delimiter) {
+  const char* const data = slice->data();
+  const size_t size = slice->size();
+  // The index is a size_t: a 32-bit counter would wrap around on slices of
+  // 4 GiB or more and the scan would never terminate.
+  size_t len = 0;
+  while (len < size && data[len] != delimiter) {
+    ++len;
+  }
+
+  Slice ret(data, len);
+  // Also consume the delimiter, if the scan stopped on one.
+  slice->remove_prefix(len < size ? len + 1 : len);
+  return ret;
+}
+
+// Stores `value` into `*memory`, where `memory` is not required to be
+// suitably aligned for T.
+// When PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED is defined the object is copied
+// byte-wise with memcpy; otherwise a plain assignment through the pointer is
+// used. Under ROCKSDB_UBSAN_RUN the function carries a no-sanitize attribute
+// (alignment check on clang, the undefined-behavior sanitizer on GCC).
+template<class T>
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+inline void PutUnaligned(T *memory, const T &value) {
+#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
+ // Destination viewed as raw bytes so that memcpy can be used.
+ char *nonAlignedMemory = reinterpret_cast<char*>(memory);
+ memcpy(nonAlignedMemory, reinterpret_cast<const char*>(&value), sizeof(T));
+#else
+ // Direct store; relies on the platform tolerating unaligned access.
+ *memory = value;
+#endif
+}
+
+// Loads a T from `memory`, which is not required to be suitably aligned for
+// T, into `*value`.
+// When PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED is defined the object is copied
+// byte-wise with memcpy; otherwise a plain read through the pointer is used.
+// Under ROCKSDB_UBSAN_RUN the function carries a no-sanitize attribute
+// (alignment check on clang, the undefined-behavior sanitizer on GCC).
+template<class T>
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+inline void GetUnaligned(const T *memory, T *value) {
+#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
+ // Despite its name, this pointer is the destination (`value`); the possibly
+ // unaligned source is `memory`.
+ char *nonAlignedMemory = reinterpret_cast<char*>(value);
+ memcpy(nonAlignedMemory, reinterpret_cast<const char*>(memory), sizeof(T));
+#else
+ // Direct load; relies on the platform tolerating unaligned access.
+ *value = *memory;
+#endif
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coding_test.cc b/src/rocksdb/util/coding_test.cc
new file mode 100644
index 000000000..383e3f514
--- /dev/null
+++ b/src/rocksdb/util/coding_test.cc
@@ -0,0 +1,217 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Empty class; its name is passed as the first argument of the TEST macros
+// in this file.
+class Coding { };
+// Round-trips every 16-bit value through PutFixed16()/DecodeFixed16().
+TEST(Coding, Fixed16) {
+  // The counter is wider than 16 bits so that the range can include the
+  // maximum value 0xFFFF without the loop condition wrapping around.
+  const uint32_t kLast = 0xFFFF;
+
+  std::string s;
+  for (uint32_t i = 0; i <= kLast; i++) {
+    PutFixed16(&s, static_cast<uint16_t>(i));
+  }
+
+  const char* p = s.data();
+  for (uint32_t i = 0; i <= kLast; i++) {
+    uint16_t actual = DecodeFixed16(p);
+    ASSERT_EQ(static_cast<uint16_t>(i), actual);
+    p += sizeof(uint16_t);
+  }
+}
+
+// Round-trips the values 0..99999 through PutFixed32()/DecodeFixed32().
+TEST(Coding, Fixed32) {
+  const uint32_t kCount = 100000;
+
+  std::string encoded;
+  for (uint32_t v = 0; v < kCount; v++) {
+    PutFixed32(&encoded, v);
+  }
+
+  const char* cursor = encoded.data();
+  for (uint32_t v = 0; v < kCount; v++, cursor += sizeof(uint32_t)) {
+    const uint32_t actual = DecodeFixed32(cursor);
+    ASSERT_EQ(v, actual);
+  }
+}
+
+// Round-trips, for every power of two 2^k with k in 0..63, the three values
+// 2^k - 1, 2^k and 2^k + 1 through PutFixed64()/DecodeFixed64().
+TEST(Coding, Fixed64) {
+  std::string encoded;
+  for (int power = 0; power <= 63; power++) {
+    const uint64_t base = static_cast<uint64_t>(1) << power;
+    const uint64_t neighbours[] = {base - 1, base + 0, base + 1};
+    for (uint64_t value : neighbours) {
+      PutFixed64(&encoded, value);
+    }
+  }
+
+  const char* cursor = encoded.data();
+  for (int power = 0; power <= 63; power++) {
+    const uint64_t base = static_cast<uint64_t>(1) << power;
+    const uint64_t neighbours[] = {base - 1, base + 0, base + 1};
+    for (uint64_t expected : neighbours) {
+      const uint64_t actual = DecodeFixed64(cursor);
+      ASSERT_EQ(expected, actual);
+      cursor += sizeof(uint64_t);
+    }
+  }
+}
+
+// Test that encoding routines generate little-endian encodings: the inputs
+// are chosen so that byte i of the output must equal i + 1.
+TEST(Coding, EncodingOutput) {
+  std::string dst;
+  PutFixed32(&dst, 0x04030201);
+  ASSERT_EQ(4U, dst.size());
+  for (size_t i = 0; i < 4; i++) {
+    ASSERT_EQ(static_cast<int>(i + 1), static_cast<int>(dst[i]));
+  }
+
+  dst.clear();
+  PutFixed64(&dst, 0x0807060504030201ull);
+  ASSERT_EQ(8U, dst.size());
+  for (size_t i = 0; i < 8; i++) {
+    ASSERT_EQ(static_cast<int>(i + 1), static_cast<int>(dst[i]));
+  }
+}
+
+// Encodes (i / 32) << (i % 32) for i in [0, 32 * 32) as varint32 values, then
+// decodes the stream and checks each value, its encoded length, and that the
+// whole buffer was consumed.
+TEST(Coding, Varint32) {
+  const uint32_t kCases = 32 * 32;
+
+  std::string encoded;
+  for (uint32_t i = 0; i < kCases; i++) {
+    const uint32_t value = (i / 32) << (i % 32);
+    PutVarint32(&encoded, value);
+  }
+
+  const char* cursor = encoded.data();
+  const char* const limit = cursor + encoded.size();
+  for (uint32_t i = 0; i < kCases; i++) {
+    const uint32_t expected = (i / 32) << (i % 32);
+    const char* const start = cursor;
+    uint32_t actual = 0;
+    cursor = GetVarint32Ptr(cursor, limit, &actual);
+    ASSERT_TRUE(cursor != nullptr);
+    ASSERT_EQ(expected, actual);
+    ASSERT_EQ(VarintLength(actual), cursor - start);
+  }
+  ASSERT_EQ(cursor, encoded.data() + encoded.size());
+}
+
+// Round-trips a list of interesting 64-bit values (a few special values and
+// the neighbourhood of every power of two) through the varint64 encoding.
+TEST(Coding, Varint64) {
+  // Construct the list of values to check
+  const uint64_t kAllOnes = ~static_cast<uint64_t>(0);
+  std::vector<uint64_t> values;
+  // Some special values
+  values.push_back(0);
+  values.push_back(100);
+  values.push_back(kAllOnes);
+  values.push_back(kAllOnes - 1);
+  // Values near powers of two
+  for (uint32_t k = 0; k < 64; k++) {
+    const uint64_t power = 1ull << k;
+    values.push_back(power);
+    values.push_back(power - 1);
+    values.push_back(power + 1);
+  }
+
+  std::string encoded;
+  for (uint64_t value : values) {
+    PutVarint64(&encoded, value);
+  }
+
+  const char* cursor = encoded.data();
+  const char* const limit = cursor + encoded.size();
+  for (uint64_t expected : values) {
+    ASSERT_TRUE(cursor < limit);
+    const char* const start = cursor;
+    uint64_t actual = 0;
+    cursor = GetVarint64Ptr(cursor, limit, &actual);
+    ASSERT_TRUE(cursor != nullptr);
+    ASSERT_EQ(expected, actual);
+    ASSERT_EQ(VarintLength(actual), cursor - start);
+  }
+  ASSERT_EQ(cursor, limit);
+}
+
+// Five bytes with the continuation bit set followed by a sixth byte must be
+// rejected by the varint32 parser.
+TEST(Coding, Varint32Overflow) {
+  uint32_t result;
+  std::string input("\x81\x82\x83\x84\x85\x11");
+  const char* const begin = input.data();
+  const char* const parsed =
+      GetVarint32Ptr(begin, begin + input.size(), &result);
+  ASSERT_TRUE(parsed == nullptr);
+}
+
+// Parsing a varint32 from a buffer that was cut short must fail; parsing the
+// complete encoding must succeed and yield the original value.
+TEST(Coding, Varint32Truncation) {
+  const uint32_t large_value = (1u << 31) + 100;
+  std::string encoded;
+  PutVarint32(&encoded, large_value);
+
+  const char* const begin = encoded.data();
+  const size_t full_len = encoded.size();
+  uint32_t result;
+  for (size_t len = 0; len + 1 < full_len; len++) {
+    ASSERT_TRUE(GetVarint32Ptr(begin, begin + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(GetVarint32Ptr(begin, begin + full_len, &result) != nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+// Ten bytes with the continuation bit set followed by an eleventh byte must
+// be rejected by the varint64 parser.
+TEST(Coding, Varint64Overflow) {
+  uint64_t result;
+  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+  const char* const begin = input.data();
+  const char* const parsed =
+      GetVarint64Ptr(begin, begin + input.size(), &result);
+  ASSERT_TRUE(parsed == nullptr);
+}
+
+// Parsing a varint64 from a buffer that was cut short must fail; parsing the
+// complete encoding must succeed and yield the original value.
+TEST(Coding, Varint64Truncation) {
+  const uint64_t large_value = (1ull << 63) + 100ull;
+  std::string encoded;
+  PutVarint64(&encoded, large_value);
+
+  const char* const begin = encoded.data();
+  const size_t full_len = encoded.size();
+  uint64_t result;
+  for (size_t len = 0; len + 1 < full_len; len++) {
+    ASSERT_TRUE(GetVarint64Ptr(begin, begin + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(GetVarint64Ptr(begin, begin + full_len, &result) != nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+// Writes several length-prefixed strings back to back and reads them again
+// in order, checking that the input is fully consumed afterwards.
+TEST(Coding, Strings) {
+  const std::string expected[] = {"", "foo", "bar", std::string(200, 'x')};
+
+  std::string encoded;
+  for (const std::string& text : expected) {
+    PutLengthPrefixedSlice(&encoded, Slice(text));
+  }
+
+  Slice input(encoded);
+  Slice v;
+  for (const std::string& text : expected) {
+    ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+    ASSERT_EQ(text, v.ToString());
+  }
+  ASSERT_EQ("", input.ToString());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: hands the command line to GoogleTest and runs
+// every registered test.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/util/compaction_job_stats_impl.cc b/src/rocksdb/util/compaction_job_stats_impl.cc
new file mode 100644
index 000000000..4d16891e4
--- /dev/null
+++ b/src/rocksdb/util/compaction_job_stats_impl.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/compaction_job_stats.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// Returns this object to its initial state: every counter and timing field
+// listed below is set to zero and both output key prefixes are cleared.
+void CompactionJobStats::Reset() {
+ // Timing of the job.
+ elapsed_micros = 0;
+ cpu_micros = 0;
+
+ // Input side.
+ num_input_records = 0;
+ num_input_files = 0;
+ num_input_files_at_output_level = 0;
+
+ // Output side.
+ num_output_records = 0;
+ num_output_files = 0;
+
+ is_manual_compaction = 0;
+
+ total_input_bytes = 0;
+ total_output_bytes = 0;
+
+ num_records_replaced = 0;
+
+ total_input_raw_key_bytes = 0;
+ total_input_raw_value_bytes = 0;
+
+ num_input_deletion_records = 0;
+ num_expired_deletion_records = 0;
+
+ num_corrupt_keys = 0;
+
+ // File I/O timings.
+ file_write_nanos = 0;
+ file_range_sync_nanos = 0;
+ file_fsync_nanos = 0;
+ file_prepare_write_nanos = 0;
+
+ smallest_output_key_prefix.clear();
+ largest_output_key_prefix.clear();
+
+ num_single_del_fallthru = 0;
+ num_single_del_mismatch = 0;
+}
+
+// Accumulates the counters and timings of `stats` into this object by
+// field-wise addition.
+// Three fields that Reset() touches are not merged here and keep their
+// current values: is_manual_compaction, smallest_output_key_prefix and
+// largest_output_key_prefix.
+void CompactionJobStats::Add(const CompactionJobStats& stats) {
+ elapsed_micros += stats.elapsed_micros;
+ cpu_micros += stats.cpu_micros;
+
+ num_input_records += stats.num_input_records;
+ num_input_files += stats.num_input_files;
+ num_input_files_at_output_level += stats.num_input_files_at_output_level;
+
+ num_output_records += stats.num_output_records;
+ num_output_files += stats.num_output_files;
+
+ total_input_bytes += stats.total_input_bytes;
+ total_output_bytes += stats.total_output_bytes;
+
+ num_records_replaced += stats.num_records_replaced;
+
+ total_input_raw_key_bytes += stats.total_input_raw_key_bytes;
+ total_input_raw_value_bytes += stats.total_input_raw_value_bytes;
+
+ num_input_deletion_records += stats.num_input_deletion_records;
+ num_expired_deletion_records += stats.num_expired_deletion_records;
+
+ num_corrupt_keys += stats.num_corrupt_keys;
+
+ file_write_nanos += stats.file_write_nanos;
+ file_range_sync_nanos += stats.file_range_sync_nanos;
+ file_fsync_nanos += stats.file_fsync_nanos;
+ file_prepare_write_nanos += stats.file_prepare_write_nanos;
+
+ num_single_del_fallthru += stats.num_single_del_fallthru;
+ num_single_del_mismatch += stats.num_single_del_mismatch;
+}
+
+#else
+
+// ROCKSDB_LITE build: resetting is a no-op.
+void CompactionJobStats::Reset() {}
+
+// ROCKSDB_LITE build: accumulation is a no-op; the argument is ignored.
+void CompactionJobStats::Add(const CompactionJobStats& /*stats*/) {}
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/comparator.cc b/src/rocksdb/util/comparator.cc
new file mode 100644
index 000000000..48340bd96
--- /dev/null
+++ b/src/rocksdb/util/comparator.cc
@@ -0,0 +1,216 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/comparator.h"
+#include <stdint.h>
+#include <algorithm>
+#include <memory>
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+// Comparator that orders keys by Slice::compare() on their raw bytes.
+class BytewiseComparatorImpl : public Comparator {
+ public:
+ BytewiseComparatorImpl() { }
+
+ const char* Name() const override { return "leveldb.BytewiseComparator"; }
+
+ // Three-way comparison; delegates to Slice::compare().
+ int Compare(const Slice& a, const Slice& b) const override {
+ return a.compare(b);
+ }
+
+ bool Equal(const Slice& a, const Slice& b) const override { return a == b; }
+
+ // May replace *start with a shorter string. *start is left unchanged when
+ // one string is a prefix of the other, or when the first differing byte of
+ // *start is not smaller than that of limit. When *start is changed, the
+ // closing assert checks that it still compares smaller than limit.
+ void FindShortestSeparator(std::string* start,
+ const Slice& limit) const override {
+ // Find length of common prefix
+ size_t min_length = std::min(start->size(), limit.size());
+ size_t diff_index = 0;
+ while ((diff_index < min_length) &&
+ ((*start)[diff_index] == limit[diff_index])) {
+ diff_index++;
+ }
+
+ if (diff_index >= min_length) {
+ // Do not shorten if one string is a prefix of the other
+ } else {
+ uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]);
+ uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]);
+ if (start_byte >= limit_byte) {
+ // Cannot shorten since limit is smaller than start or start is
+ // already the shortest possible.
+ return;
+ }
+ assert(start_byte < limit_byte);
+
+ // Incrementing the differing byte is safe when limit has more bytes after
+ // it, or when the incremented byte is still smaller than limit's byte.
+ if (diff_index < limit.size() - 1 || start_byte + 1 < limit_byte) {
+ (*start)[diff_index]++;
+ start->resize(diff_index + 1);
+ } else {
+ // v
+ // A A 1 A A A
+ // A A 2
+ //
+ // Incrementing the current byte will make start bigger than limit, we
+ // will skip this byte, and find the first non 0xFF byte in start and
+ // increment it.
+ diff_index++;
+
+ while (diff_index < start->size()) {
+ // Keep moving until we find the first non 0xFF byte to
+ // increment it
+ if (static_cast<uint8_t>((*start)[diff_index]) <
+ static_cast<uint8_t>(0xff)) {
+ (*start)[diff_index]++;
+ start->resize(diff_index + 1);
+ break;
+ }
+ diff_index++;
+ }
+ }
+ assert(Compare(*start, limit) < 0);
+ }
+ }
+
+ // Increments the first byte of *key that is not 0xff and truncates the key
+ // right after it. A key made only of 0xff bytes (or an empty key) is left
+ // unchanged.
+ void FindShortSuccessor(std::string* key) const override {
+ // Find first character that can be incremented
+ size_t n = key->size();
+ for (size_t i = 0; i < n; i++) {
+ const uint8_t byte = (*key)[i];
+ if (byte != static_cast<uint8_t>(0xff)) {
+ (*key)[i] = byte + 1;
+ key->resize(i+1);
+ return;
+ }
+ }
+ // *key is a run of 0xffs. Leave it alone.
+ }
+
+ // Returns true when s and t are non-empty, have the same length, and differ
+ // such that t's first differing byte is exactly one larger than s's while
+ // all following bytes are 0xff in s and 0x00 in t.
+ bool IsSameLengthImmediateSuccessor(const Slice& s,
+ const Slice& t) const override {
+ if (s.size() != t.size() || s.size() == 0) {
+ return false;
+ }
+ size_t diff_ind = s.difference_offset(t);
+ // same slice
+ if (diff_ind >= s.size()) return false;
+ uint8_t byte_s = static_cast<uint8_t>(s[diff_ind]);
+ uint8_t byte_t = static_cast<uint8_t>(t[diff_ind]);
+ // first different byte must be consecutive, and remaining bytes must be
+ // 0xff for s and 0x00 for t
+ if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) {
+ for (size_t i = diff_ind + 1; i < s.size(); ++i) {
+ byte_s = static_cast<uint8_t>(s[i]);
+ byte_t = static_cast<uint8_t>(t[i]);
+ if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) {
+ return false;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ bool CanKeysWithDifferentByteContentsBeEqual() const override {
+ return false;
+ }
+
+ // Same result as Compare(): both delegate to Slice::compare().
+ int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+ return a.compare(b);
+ }
+};
+
+// Comparator that orders keys by the negated result of Slice::compare().
+// Members not overridden here are inherited from BytewiseComparatorImpl.
+class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
+ public:
+ ReverseBytewiseComparatorImpl() { }
+
+ const char* Name() const override {
+ return "rocksdb.ReverseBytewiseComparator";
+ }
+
+ // Three-way comparison with the sign of Slice::compare() flipped.
+ int Compare(const Slice& a, const Slice& b) const override {
+ return -a.compare(b);
+ }
+
+ // May truncate *start right after the first byte in which it differs from
+ // limit. This only happens when that byte is larger in *start than in limit
+ // and is not already the last byte of *start; otherwise *start is left
+ // unchanged.
+ void FindShortestSeparator(std::string* start,
+ const Slice& limit) const override {
+ // Find length of common prefix
+ size_t min_length = std::min(start->size(), limit.size());
+ size_t diff_index = 0;
+ while ((diff_index < min_length) &&
+ ((*start)[diff_index] == limit[diff_index])) {
+ diff_index++;
+ }
+
+ assert(diff_index <= min_length);
+ if (diff_index == min_length) {
+ // Do not shorten if one string is a prefix of the other
+ //
+ // We could handle cases like:
+ // V
+ // A A 2 X Y
+ // A A 2
+ // in a similar way as BytewiseComparator::FindShortestSeparator().
+ // We keep it simple by not implementing it. We can come back to it
+ // later when needed.
+ } else {
+ uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]);
+ uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]);
+ if (start_byte > limit_byte && diff_index < start->size() - 1) {
+ // Case like
+ // V
+ // A A 3 A A
+ // A A 1 B B
+ //
+ // or
+ // v
+ // A A 2 A A
+ // A A 1 B B
+ // In this case "AA2" will be good.
+#ifndef NDEBUG
+ std::string old_start = *start;
+#endif
+ start->resize(diff_index + 1);
+#ifndef NDEBUG
+ assert(old_start >= *start);
+#endif
+ assert(Slice(*start).compare(limit) > 0);
+ }
+ }
+ }
+
+ void FindShortSuccessor(std::string* /*key*/) const override {
+ // Don't do anything for simplicity.
+ }
+
+ bool CanKeysWithDifferentByteContentsBeEqual() const override {
+ return false;
+ }
+
+ // Same result as Compare(): both negate Slice::compare().
+ int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+ return -a.compare(b);
+ }
+};
+}// namespace
+
+// Returns a pointer to a function-local static BytewiseComparatorImpl; every
+// call yields the same object.
+const Comparator* BytewiseComparator() {
+  static BytewiseComparatorImpl instance;
+  const Comparator* const shared = &instance;
+  return shared;
+}
+
+// Returns a pointer to a function-local static ReverseBytewiseComparatorImpl;
+// every call yields the same object.
+const Comparator* ReverseBytewiseComparator() {
+  static ReverseBytewiseComparatorImpl instance;
+  const Comparator* const shared = &instance;
+  return shared;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/compression.h b/src/rocksdb/util/compression.h
new file mode 100644
index 000000000..8169841ba
--- /dev/null
+++ b/src/rocksdb/util/compression.h
@@ -0,0 +1,1407 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <algorithm>
+#include <limits>
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#ifdef OS_FREEBSD
+#include <malloc_np.h>
+#else // OS_FREEBSD
+#include <malloc.h>
+#endif // OS_FREEBSD
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+#include <string>
+
+#include "memory/memory_allocator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/string_util.h"
+
+#ifdef SNAPPY
+#include <snappy.h>
+#endif
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef BZIP2
+#include <bzlib.h>
+#endif
+
+#if defined(LZ4)
+#include <lz4.h>
+#include <lz4hc.h>
+#endif
+
+#if defined(ZSTD)
+#include <zstd.h>
+#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+
+#include <zdict.h>
+#endif // ZSTD_VERSION_NUMBER >= 10103
+namespace ROCKSDB_NAMESPACE {
+// Need this for the context allocation override
+// On windows we need to do this explicitly
+#if (ZSTD_VERSION_NUMBER >= 500)
+#if defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && \
+ defined(ZSTD_STATIC_LINKING_ONLY)
+#define ROCKSDB_ZSTD_CUSTOM_MEM
+namespace port {
+ZSTD_customMem GetJeZstdAllocationOverrides();
+} // namespace port
+#endif // defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) &&
+ // defined(ZSTD_STATIC_LINKING_ONLY)
+
+// We require `ZSTD_sizeof_DDict` and `ZSTD_createDDict_byReference` to use
+// `ZSTD_DDict`. The former was introduced in v1.0.0 and the latter was
+// introduced in v1.1.3. But an important bug fix for `ZSTD_sizeof_DDict` came
+// in v1.1.4, so that is the version we require. As of today's latest version
+// (v1.3.8), they are both still in the experimental API, which means they are
+// only exported when the compiler flag `ZSTD_STATIC_LINKING_ONLY` is set.
+#if defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104
+#define ROCKSDB_ZSTD_DDICT
+#endif // defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104
+
+// Cached data represents a portion that can be re-used
+// If, in the future we have more than one native context to
+// cache we can arrange this as a tuple
+// Move-only holder of a ZSTD decompression context (ZSTD_DCtx*). The context
+// is either created and owned by this object (cache_idx_ == -1) or borrowed
+// from a cache entry identified by cache_idx_; only an owned context is
+// freed by the destructor.
+class ZSTDUncompressCachedData {
+ public:
+ using ZSTDNativeContext = ZSTD_DCtx*;
+ ZSTDUncompressCachedData() {}
+ // Copying is disabled; use the move operations or InitFromCache().
+ ZSTDUncompressCachedData(const ZSTDUncompressCachedData& o) = delete;
+ ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete;
+ // Move construction: start empty, then swap state with `o`.
+ ZSTDUncompressCachedData(ZSTDUncompressCachedData&& o) ROCKSDB_NOEXCEPT
+ : ZSTDUncompressCachedData() {
+ *this = std::move(o);
+ }
+ // Move assignment swaps the context and cache index with `o`. The assert
+ // requires that this object holds no context beforehand.
+ ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&& o)
+ ROCKSDB_NOEXCEPT {
+ assert(zstd_ctx_ == nullptr);
+ std::swap(zstd_ctx_, o.zstd_ctx_);
+ std::swap(cache_idx_, o.cache_idx_);
+ return *this;
+ }
+ ZSTDNativeContext Get() const { return zstd_ctx_; }
+ int64_t GetCacheIndex() const { return cache_idx_; }
+ // Creates an owned context if none is held yet. With
+ // ROCKSDB_ZSTD_CUSTOM_MEM the context is created through the allocator
+ // overrides returned by port::GetJeZstdAllocationOverrides().
+ void CreateIfNeeded() {
+ if (zstd_ctx_ == nullptr) {
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+ zstd_ctx_ =
+ ZSTD_createDCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else // ROCKSDB_ZSTD_CUSTOM_MEM
+ zstd_ctx_ = ZSTD_createDCtx();
+#endif // ROCKSDB_ZSTD_CUSTOM_MEM
+ cache_idx_ = -1;
+ }
+ }
+ // Borrows the context held by `o` and records `idx` as its cache index.
+ // Any context pointer previously stored here is overwritten, not freed.
+ void InitFromCache(const ZSTDUncompressCachedData& o, int64_t idx) {
+ zstd_ctx_ = o.zstd_ctx_;
+ cache_idx_ = idx;
+ }
+ // Frees the context only when it is owned (cache_idx_ == -1).
+ ~ZSTDUncompressCachedData() {
+ if (zstd_ctx_ != nullptr && cache_idx_ == -1) {
+ ZSTD_freeDCtx(zstd_ctx_);
+ }
+ }
+
+ private:
+ ZSTDNativeContext zstd_ctx_ = nullptr;
+ int64_t cache_idx_ = -1; // -1 means this instance owns the context
+};
+#endif // (ZSTD_VERSION_NUMBER >= 500)
+} // namespace ROCKSDB_NAMESPACE
+#endif // ZSTD
+
+#if !(defined ZSTD) || !(ZSTD_VERSION_NUMBER >= 500)
+namespace ROCKSDB_NAMESPACE {
+// Placeholder used when ZSTD is not compiled in or is older than the required
+// version. It offers the same member functions as the real class, but never
+// holds a context: Get() returns nullptr and GetCacheIndex() returns -1.
+class ZSTDUncompressCachedData {
+ public:
+  using ZSTDNativeContext = void*;
+
+  ZSTDUncompressCachedData() {}
+  ZSTDUncompressCachedData(const ZSTDUncompressCachedData&) {}
+  ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete;
+  ZSTDUncompressCachedData(ZSTDUncompressCachedData&&)
+      ROCKSDB_NOEXCEPT = default;
+  ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&&)
+      ROCKSDB_NOEXCEPT = default;
+
+  ZSTDNativeContext Get() const { return nullptr; }
+  int64_t GetCacheIndex() const { return -1; }
+  void CreateIfNeeded() {}
+  void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {}
+
+ private:
+  // Only exists so that `padding` is referenced somewhere.
+  void ignore_padding__() { padding = nullptr; }
+
+  void* padding;  // unused
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif
+
+#if defined(XPRESS)
+#include "port/xpress.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Holds dictionary and related data, like ZSTD's digested compression
+// dictionary.
+// Owns the raw dictionary bytes (dict_) and, when built against ZSTD >= 0.7.0
+// (ZSTD_VERSION_NUMBER >= 700), a digested ZSTD compression dictionary
+// created from them. Neither copyable nor movable.
+struct CompressionDict {
+#if ZSTD_VERSION_NUMBER >= 700
+ ZSTD_CDict* zstd_cdict_ = nullptr;
+#endif // ZSTD_VERSION_NUMBER >= 700
+ std::string dict_;
+
+ public:
+ // Takes ownership of `dict`. With ZSTD >= 0.7.0, a non-empty dictionary and
+ // a ZSTD compression type, a digested dictionary is also created for
+ // `level`. Without ZSTD >= 0.7.0, `type` and `level` are unused.
+#if ZSTD_VERSION_NUMBER >= 700
+ CompressionDict(std::string dict, CompressionType type, int level) {
+#else // ZSTD_VERSION_NUMBER >= 700
+ CompressionDict(std::string dict, CompressionType /*type*/, int /*level*/) {
+#endif // ZSTD_VERSION_NUMBER >= 700
+ dict_ = std::move(dict);
+#if ZSTD_VERSION_NUMBER >= 700
+ zstd_cdict_ = nullptr;
+ if (!dict_.empty() && (type == kZSTD || type == kZSTDNotFinalCompression)) {
+ if (level == CompressionOptions::kDefaultCompressionLevel) {
+ // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
+ // https://github.com/facebook/zstd/issues/1148
+ level = 3;
+ }
+ // Should be safe (but slower) if below call fails as we'll use the
+ // raw dictionary to compress.
+ // NOTE(review): the assert below still aborts debug builds on failure.
+ zstd_cdict_ = ZSTD_createCDict(dict_.data(), dict_.size(), level);
+ assert(zstd_cdict_ != nullptr);
+ }
+#endif // ZSTD_VERSION_NUMBER >= 700
+ }
+
+ // Frees the digested dictionary, if one was created.
+ ~CompressionDict() {
+#if ZSTD_VERSION_NUMBER >= 700
+ size_t res = 0;
+ if (zstd_cdict_ != nullptr) {
+ res = ZSTD_freeCDict(zstd_cdict_);
+ }
+ assert(res == 0); // Last I checked they can't fail
+ (void)res; // prevent unused var warning
+#endif // ZSTD_VERSION_NUMBER >= 700
+ }
+
+#if ZSTD_VERSION_NUMBER >= 700
+ // May return nullptr when no digested dictionary was created.
+ const ZSTD_CDict* GetDigestedZstdCDict() const { return zstd_cdict_; }
+#endif // ZSTD_VERSION_NUMBER >= 700
+
+ // Returns a Slice referring to the owned raw dictionary bytes.
+ Slice GetRawDict() const { return dict_; }
+
+ // Returns a reference to a function-local static, default-constructed
+ // (empty) dictionary.
+ static const CompressionDict& GetEmptyDict() {
+ static CompressionDict empty_dict{};
+ return empty_dict;
+ }
+
+ CompressionDict() = default;
+ // Disable copy/move
+ CompressionDict(const CompressionDict&) = delete;
+ CompressionDict& operator=(const CompressionDict&) = delete;
+ CompressionDict(CompressionDict&&) = delete;
+ CompressionDict& operator=(CompressionDict&&) = delete;
+};
+
+// Holds dictionary and related data, like ZSTD's digested uncompression
+// dictionary.
+struct UncompressionDict {
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a string parameter is used.
+  std::string dict_;
+
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a Slice parameter is used and the passed in
+  // CacheAllocationPtr is not nullptr.
+  CacheAllocationPtr allocation_;
+
+  // Slice pointing to the compression dictionary data. Can point to
+  // dict_, allocation_, or some other memory location, depending on how
+  // the object was constructed.
+  Slice slice_;
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  // Processed version of the contents of slice_ for ZSTD compression.
+  ZSTD_DDict* zstd_ddict_ = nullptr;
+#endif  // ROCKSDB_ZSTD_DDICT
+
+  // Takes ownership of `dict`; slice_ refers to the owned string. When
+  // ROCKSDB_ZSTD_DDICT is defined, `using_zstd` is true and the dictionary is
+  // not empty, a digested dictionary referencing the same bytes is created.
+#ifdef ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool using_zstd)
+#else   // ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool /* using_zstd */)
+#endif  // ROCKSDB_ZSTD_DDICT
+      : dict_(std::move(dict)), slice_(dict_) {
+#ifdef ROCKSDB_ZSTD_DDICT
+    if (!slice_.empty() && using_zstd) {
+      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
+      assert(zstd_ddict_ != nullptr);
+    }
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+  // Wraps dictionary bytes described by `slice`. `allocation` may be null, in
+  // which case the caller keeps ownership of the bytes (see own_bytes()).
+#ifdef ROCKSDB_ZSTD_DDICT
+  UncompressionDict(Slice slice, CacheAllocationPtr&& allocation,
+                    bool using_zstd)
+#else   // ROCKSDB_ZSTD_DDICT
+  UncompressionDict(Slice slice, CacheAllocationPtr&& allocation,
+                    bool /* using_zstd */)
+#endif  // ROCKSDB_ZSTD_DDICT
+      : allocation_(std::move(allocation)), slice_(std::move(slice)) {
+#ifdef ROCKSDB_ZSTD_DDICT
+    if (!slice_.empty() && using_zstd) {
+      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
+      assert(zstd_ddict_ != nullptr);
+    }
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+  // Move construction: takes over the storage, the slice and the digested
+  // dictionary of `rhs`.
+  // NOTE(review): slice_ is copied as-is from rhs. If it points into
+  // rhs.dict_ and the string is short enough to be stored inline, the moved
+  // string's bytes presumably live at a different address afterwards -
+  // confirm that such small dictionaries are never moved.
+  UncompressionDict(UncompressionDict&& rhs)
+      : dict_(std::move(rhs.dict_)),
+        allocation_(std::move(rhs.allocation_)),
+        slice_(std::move(rhs.slice_))
+#ifdef ROCKSDB_ZSTD_DDICT
+        ,
+        zstd_ddict_(rhs.zstd_ddict_)
+#endif
+  {
+#ifdef ROCKSDB_ZSTD_DDICT
+    rhs.zstd_ddict_ = nullptr;
+#endif
+  }
+
+  ~UncompressionDict() {
+#ifdef ROCKSDB_ZSTD_DDICT
+    size_t res = 0;
+    if (zstd_ddict_ != nullptr) {
+      res = ZSTD_freeDDict(zstd_ddict_);
+    }
+    assert(res == 0);  // Last I checked they can't fail
+    (void)res;         // prevent unused var warning
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+  // Move assignment: releases whatever this object currently holds and takes
+  // over the state of `rhs`. Self-assignment is a no-op.
+  UncompressionDict& operator=(UncompressionDict&& rhs) {
+    if (this == &rhs) {
+      return *this;
+    }
+
+#ifdef ROCKSDB_ZSTD_DDICT
+    // Free the digested dictionary owned so far before its pointer is
+    // overwritten below; otherwise it would be leaked.
+    if (zstd_ddict_ != nullptr) {
+      size_t res = ZSTD_freeDDict(zstd_ddict_);
+      assert(res == 0);  // Last I checked they can't fail
+      (void)res;         // prevent unused var warning
+      zstd_ddict_ = nullptr;
+    }
+#endif
+
+    dict_ = std::move(rhs.dict_);
+    allocation_ = std::move(rhs.allocation_);
+    slice_ = std::move(rhs.slice_);
+
+#ifdef ROCKSDB_ZSTD_DDICT
+    zstd_ddict_ = rhs.zstd_ddict_;
+    rhs.zstd_ddict_ = nullptr;
+#endif
+
+    return *this;
+  }
+
+  // The object is self-contained if the string constructor is used, or the
+  // Slice constructor is invoked with a non-null allocation. Otherwise, it
+  // is the caller's responsibility to ensure that the underlying storage
+  // outlives this object.
+  bool own_bytes() const { return !dict_.empty() || allocation_; }
+
+  const Slice& GetRawDict() const { return slice_; }
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  // May return nullptr when no digested dictionary was created.
+  const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; }
+#endif  // ROCKSDB_ZSTD_DDICT
+
+  // Returns a reference to a function-local static, default-constructed
+  // (empty) dictionary.
+  static const UncompressionDict& GetEmptyDict() {
+    static UncompressionDict empty_dict{};
+    return empty_dict;
+  }
+
+  // Estimates the memory held by this object: the struct itself, the owned
+  // string, the owned allocation (using the allocator's usable size when an
+  // allocator is attached) and, if available, the digested dictionary.
+  size_t ApproximateMemoryUsage() const {
+    size_t usage = sizeof(struct UncompressionDict);
+    usage += dict_.size();
+    if (allocation_) {
+      auto allocator = allocation_.get_deleter().allocator;
+      if (allocator) {
+        usage += allocator->UsableSize(allocation_.get(), slice_.size());
+      } else {
+        usage += slice_.size();
+      }
+    }
+#ifdef ROCKSDB_ZSTD_DDICT
+    usage += ZSTD_sizeof_DDict(zstd_ddict_);
+#endif  // ROCKSDB_ZSTD_DDICT
+    return usage;
+  }
+
+  UncompressionDict() = default;
+  // Disable copy. These must name UncompressionDict itself to be the copy
+  // operations; copying would otherwise duplicate ownership of zstd_ddict_.
+  UncompressionDict(const UncompressionDict&) = delete;
+  UncompressionDict& operator=(const UncompressionDict&) = delete;
+};
+
+// Non-copyable owner of the native compression context needed for a given
+// compression type. With ZSTD >= 0.5.0 (ZSTD_VERSION_NUMBER >= 500) and a
+// ZSTD compression type it holds a ZSTD_CCtx; otherwise it holds nothing and
+// creation and destruction are no-ops.
+class CompressionContext {
+ private:
+#if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500)
+ ZSTD_CCtx* zstd_ctx_ = nullptr;
+ // Creates the ZSTD context for the ZSTD compression types only. With
+ // ROCKSDB_ZSTD_CUSTOM_MEM the allocator overrides returned by
+ // port::GetJeZstdAllocationOverrides() are used.
+ void CreateNativeContext(CompressionType type) {
+ if (type == kZSTD || type == kZSTDNotFinalCompression) {
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+ zstd_ctx_ =
+ ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else // ROCKSDB_ZSTD_CUSTOM_MEM
+ zstd_ctx_ = ZSTD_createCCtx();
+#endif // ROCKSDB_ZSTD_CUSTOM_MEM
+ }
+ }
+ // Frees the ZSTD context if one was created.
+ void DestroyNativeContext() {
+ if (zstd_ctx_ != nullptr) {
+ ZSTD_freeCCtx(zstd_ctx_);
+ }
+ }
+
+ public:
+ // callable inside ZSTD_Compress
+ // Asserts that a context exists, i.e. that the object was constructed with
+ // a ZSTD compression type.
+ ZSTD_CCtx* ZSTDPreallocCtx() const {
+ assert(zstd_ctx_ != nullptr);
+ return zstd_ctx_;
+ }
+
+#else // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ private:
+ void CreateNativeContext(CompressionType /* type */) {}
+ void DestroyNativeContext() {}
+#endif // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ public:
+ // Creates the native context for `type` (if any); the destructor releases
+ // it again.
+ explicit CompressionContext(CompressionType type) {
+ CreateNativeContext(type);
+ }
+ ~CompressionContext() { DestroyNativeContext(); }
+ CompressionContext(const CompressionContext&) = delete;
+ CompressionContext& operator=(const CompressionContext&) = delete;
+};
+
+// Bundles everything a *_Compress function needs: options, native context,
+// dictionary, compression type and the sample_for_compression value. The
+// options, context and dictionary are stored as references, so they must
+// outlive this object.
+class CompressionInfo {
+  const CompressionOptions& opts_;
+  const CompressionContext& context_;
+  const CompressionDict& dict_;
+  const CompressionType type_;
+  const uint64_t sample_for_compression_;
+
+ public:
+  CompressionInfo(const CompressionOptions& _opts,
+                  const CompressionContext& _context,
+                  const CompressionDict& _dict, CompressionType _type,
+                  uint64_t _sample_for_compression)
+      : opts_(_opts),
+        context_(_context),
+        dict_(_dict),
+        type_(_type),
+        sample_for_compression_(_sample_for_compression) {}
+
+  // Read-only accessors for the values captured at construction.
+  const CompressionOptions& options() const {
+    return opts_;
+  }
+  const CompressionContext& context() const {
+    return context_;
+  }
+  const CompressionDict& dict() const {
+    return dict_;
+  }
+  CompressionType type() const {
+    return type_;
+  }
+  uint64_t SampleForCompression() const {
+    return sample_for_compression_;
+  }
+};
+
+// Provides the ZSTD decompression state used by ZSTD_Uncompress. For the two
+// ZSTD compression types the single-argument constructor obtains the state
+// from CompressionContextCache, and the destructor hands it back when it has
+// a cache index other than -1. Not copyable.
+class UncompressionContext {
+ private:
+  CompressionContextCache* ctx_cache_ = nullptr;
+  ZSTDUncompressCachedData uncomp_cached_data_;
+
+ public:
+  struct NoCache {};
+  // Do not use context cache, used by TableBuilder
+  UncompressionContext(NoCache, CompressionType /* type */) {}
+
+  explicit UncompressionContext(CompressionType type) {
+    const bool is_zstd = (type == kZSTD || type == kZSTDNotFinalCompression);
+    if (!is_zstd) {
+      return;
+    }
+    ctx_cache_ = CompressionContextCache::Instance();
+    uncomp_cached_data_ = ctx_cache_->GetCachedZSTDUncompressData();
+  }
+
+  ~UncompressionContext() {
+    // An index of -1 means there is nothing to return to the cache.
+    const auto cache_idx = uncomp_cached_data_.GetCacheIndex();
+    if (cache_idx != -1) {
+      assert(ctx_cache_ != nullptr);
+      ctx_cache_->ReturnCachedZSTDUncompressData(cache_idx);
+    }
+  }
+
+  UncompressionContext(const UncompressionContext&) = delete;
+  UncompressionContext& operator=(const UncompressionContext&) = delete;
+
+  // Returns the native ZSTD context held by the cached data.
+  ZSTDUncompressCachedData::ZSTDNativeContext GetZSTDContext() const {
+    return uncomp_cached_data_.Get();
+  }
+};
+
+// Bundles what a *_Uncompress function needs: the uncompression context, the
+// dictionary and the compression type. Context and dictionary are stored as
+// references, so they must outlive this object.
+class UncompressionInfo {
+  const UncompressionContext& context_;
+  const UncompressionDict& dict_;
+  const CompressionType type_;
+
+ public:
+  UncompressionInfo(const UncompressionContext& _context,
+                    const UncompressionDict& _dict, CompressionType _type)
+      : context_(_context), dict_(_dict), type_(_type) {}
+
+  // Read-only accessors for the values captured at construction.
+  const UncompressionContext& context() const {
+    return context_;
+  }
+  const UncompressionDict& dict() const {
+    return dict_;
+  }
+  CompressionType type() const {
+    return type_;
+  }
+};
+
+// True when this build was compiled with Snappy support (SNAPPY defined).
+inline bool Snappy_Supported() {
+#ifdef SNAPPY
+  const bool supported = true;
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// True when this build was compiled with zlib support (ZLIB defined).
+inline bool Zlib_Supported() {
+#ifdef ZLIB
+  const bool supported = true;
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// True when this build was compiled with bzip2 support (BZIP2 defined).
+inline bool BZip2_Supported() {
+#ifdef BZIP2
+  const bool supported = true;
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// True when this build was compiled with LZ4 support (LZ4 defined).
+inline bool LZ4_Supported() {
+#ifdef LZ4
+  const bool supported = true;
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// True when this build was compiled with XPRESS support (XPRESS defined).
+inline bool XPRESS_Supported() {
+#ifdef XPRESS
+  const bool supported = true;
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// True when this build was compiled with ZSTD and the ZSTD library in use
+// reports version 0.8.0 or newer.
+inline bool ZSTD_Supported() {
+#ifdef ZSTD
+  // ZSTD format is finalized since version 0.8.0.
+  const bool supported = (ZSTD_versionNumber() >= 800);
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// True when this build was compiled with ZSTD, regardless of library version.
+inline bool ZSTDNotFinal_Supported() {
+#ifdef ZSTD
+  const bool supported = true;
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// Maps a compression type to the matching *_Supported() query.
+// kNoCompression is always supported. Any value not listed below asserts in
+// debug builds and reports false.
+inline bool CompressionTypeSupported(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return true;
+    case kSnappyCompression:
+      return Snappy_Supported();
+    case kZlibCompression:
+      return Zlib_Supported();
+    case kBZip2Compression:
+      return BZip2_Supported();
+    case kLZ4Compression:
+    case kLZ4HCCompression:
+      // Both LZ4 flavours depend on the same library.
+      return LZ4_Supported();
+    case kXpressCompression:
+      return XPRESS_Supported();
+    case kZSTD:
+      return ZSTD_Supported();
+    case kZSTDNotFinalCompression:
+      return ZSTDNotFinal_Supported();
+    default:
+      break;
+  }
+  assert(false);
+  return false;
+}
+
+// Returns the display name of a compression type. Any value not listed below
+// asserts in debug builds and yields an empty string.
+inline std::string CompressionTypeToString(CompressionType compression_type) {
+  const char* name = "";
+  switch (compression_type) {
+    case kNoCompression:
+      name = "NoCompression";
+      break;
+    case kSnappyCompression:
+      name = "Snappy";
+      break;
+    case kZlibCompression:
+      name = "Zlib";
+      break;
+    case kBZip2Compression:
+      name = "BZip2";
+      break;
+    case kLZ4Compression:
+      name = "LZ4";
+      break;
+    case kLZ4HCCompression:
+      name = "LZ4HC";
+      break;
+    case kXpressCompression:
+      name = "Xpress";
+      break;
+    case kZSTD:
+      name = "ZSTD";
+      break;
+    case kZSTDNotFinalCompression:
+      name = "ZSTDNotFinal";
+      break;
+    case kDisableCompressionOption:
+      name = "DisableOption";
+      break;
+    default:
+      assert(false);
+      break;
+  }
+  return name;
+}
+
+// Renders the compression options as a sequence of "name=value; " pairs, in
+// the order window_bits, level, strategy, max_dict_bytes,
+// zstd_max_train_bytes, enabled.
+inline std::string CompressionOptionsToString(
+    CompressionOptions& compression_options) {
+  std::string text;
+  text.reserve(512);
+
+  text += "window_bits=";
+  text += ToString(compression_options.window_bits);
+  text += "; ";
+
+  text += "level=";
+  text += ToString(compression_options.level);
+  text += "; ";
+
+  text += "strategy=";
+  text += ToString(compression_options.strategy);
+  text += "; ";
+
+  text += "max_dict_bytes=";
+  text += ToString(compression_options.max_dict_bytes);
+  text += "; ";
+
+  text += "zstd_max_train_bytes=";
+  text += ToString(compression_options.zstd_max_train_bytes);
+  text += "; ";
+
+  text += "enabled=";
+  text += ToString(compression_options.enabled);
+  text += "; ";
+
+  return text;
+}
+
+// compress_format_version can have two values:
+// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
+// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
+// way.
+// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
+// start of compressed block. Snappy format is the same as version 1.
+
+// Compresses input[0, length) with Snappy into *output, replacing its
+// contents. Returns true when built with SNAPPY; otherwise does nothing and
+// returns false. The info argument is unused.
+inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input,
+                            size_t length, ::std::string* output) {
+#ifdef SNAPPY
+  // Size the buffer for the worst case, compress, then trim to what was
+  // actually written.
+  size_t compressed_len;
+  output->resize(snappy::MaxCompressedLength(length));
+  snappy::RawCompress(input, length, &(*output)[0], &compressed_len);
+  output->resize(compressed_len);
+  return true;
+#else
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(output);
+  return false;
+#endif
+}
+
+// Forwards to snappy::GetUncompressedLength to obtain the uncompressed size
+// of a Snappy block into *result. Always returns false when built without
+// SNAPPY.
+inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result) {
+#ifdef SNAPPY
+  const bool ok = snappy::GetUncompressedLength(input, length, result);
+  return ok;
+#else
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(result);
+  return false;
+#endif
+}
+
+// Forwards to snappy::RawUncompress to decode a Snappy block into the
+// caller-provided output buffer. Always returns false when built without
+// SNAPPY.
+inline bool Snappy_Uncompress(const char* input, size_t length, char* output) {
+#ifdef SNAPPY
+  const bool ok = snappy::RawUncompress(input, length, output);
+  return ok;
+#else
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(output);
+  return false;
+#endif
+}
+
+namespace compression {
+// Appends `length` to *output as a varint32 and returns the size of *output
+// afterwards.
+inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) {
+  PutVarint32(output, length);
+  const size_t size_after = output->size();
+  return size_after;
+}
+
+// Reads a varint32 from the front of [*input_data, *input_data +
+// *input_length) into *output_len. On success advances *input_data past the
+// varint, shrinks *input_length accordingly and returns true. Returns false,
+// leaving both untouched, when GetVarint32Ptr reports failure.
+inline bool GetDecompressedSizeInfo(const char** input_data,
+                                    size_t* input_length,
+                                    uint32_t* output_len) {
+  const char* const begin = *input_data;
+  const auto after_varint =
+      GetVarint32Ptr(begin, begin + *input_length, output_len);
+  if (after_varint == nullptr) {
+    return false;
+  }
+  *input_length -= (after_varint - begin);
+  *input_data = after_varint;
+  return true;
+}
+}  // namespace compression
+
+// Compresses input[0, length) with zlib into *output.
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// The level, window bits and strategy come from info.options(); a non-empty
+// info.dict().GetRawDict() is installed as the preset dictionary.
+// Returns false when built without ZLIB, when length exceeds the uint32_t
+// range, when zlib setup fails, or when the compressed data does not fit in
+// `length` bytes.
+inline bool Zlib_Compress(const CompressionInfo& info,
+                          uint32_t compress_format_version, const char* input,
+                          size_t length, ::std::string* output) {
+#ifdef ZLIB
+  // Can't compress more than 4GB
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    return false;
+  }
+
+  // Format version 2 prefixes the block with the plain length.
+  const size_t header_len =
+      (compress_format_version == 2)
+          ? compression::PutDecompressedSizeInfo(
+                output, static_cast<uint32_t>(length))
+          : 0;
+
+  // Give the payload exactly `length` bytes. If the data expands under
+  // compression this is too small and the call fails below.
+  output->resize(header_len + length);
+
+  // memLevel controls how much memory zlib allocates for its internal state:
+  // 1 is minimal (slow, worse ratio), 9 is maximal. 8 is zlib's default; see
+  // zconf.h.
+  static const int kMemLevel = 8;
+  const int level =
+      (info.options().level == CompressionOptions::kDefaultCompressionLevel)
+          ? Z_DEFAULT_COMPRESSION
+          : info.options().level;
+
+  z_stream strm;
+  memset(&strm, 0, sizeof(z_stream));
+  int rc = deflateInit2(&strm, level, Z_DEFLATED, info.options().window_bits,
+                        kMemLevel, info.options().strategy);
+  if (rc != Z_OK) {
+    return false;
+  }
+
+  Slice dict = info.dict().GetRawDict();
+  if (dict.size()) {
+    // Preset the compression dictionary.
+    rc = deflateSetDictionary(&strm,
+                              reinterpret_cast<const Bytef*>(dict.data()),
+                              static_cast<unsigned int>(dict.size()));
+    if (rc != Z_OK) {
+      deflateEnd(&strm);
+      return false;
+    }
+  }
+
+  // Input is the caller's buffer; output starts right after the header.
+  strm.next_in = (Bytef*)input;
+  strm.avail_in = static_cast<unsigned int>(length);
+  strm.next_out = reinterpret_cast<Bytef*>(&(*output)[header_len]);
+  strm.avail_out = static_cast<unsigned int>(length);
+
+  // Only Z_STREAM_END counts as success. Z_OK means the output space ran
+  // out, i.e. the compressed form is larger than the input; treat that as a
+  // failed compression.
+  rc = deflate(&strm, Z_FINISH);
+  const bool finished = (rc == Z_STREAM_END);
+  if (finished) {
+    output->resize(output->size() - strm.avail_out);
+  }
+
+  deflateEnd(&strm);
+  return finished;
+#else
+  static_cast<void>(info);
+  static_cast<void>(compress_format_version);
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(output);
+  return false;
+#endif
+}
+
+// Decompresses a zlib block into a newly allocated buffer.
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header; the output buffer starts from an estimate and grows on demand
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// A non-empty info.dict().GetRawDict() is installed as the preset dictionary.
+// On success returns the buffer and stores the number of decompressed bytes
+// in *decompress_size. Returns nullptr on any failure, and always when built
+// without ZLIB.
+inline CacheAllocationPtr Zlib_Uncompress(
+    const UncompressionInfo& info, const char* input_data, size_t input_length,
+    int* decompress_size, uint32_t compress_format_version,
+    MemoryAllocator* allocator = nullptr, int windowBits = -14) {
+#ifdef ZLIB
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data size will be 5x the compressed size, but
+    // round to the page size
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st =
+      inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits);
+  if (st != Z_OK) {
+    return nullptr;
+  }
+
+  const Slice& compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    // Initialize the compression library's dictionary
+    st = inflateSetDictionary(
+        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
+        static_cast<unsigned int>(compression_dict.size()));
+    if (st != Z_OK) {
+      // The stream was successfully initialized above, so its internal state
+      // must be released before bailing out.
+      inflateEnd(&_stream);
+      return nullptr;
+    }
+  }
+
+  _stream.next_in = (Bytef*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (Bytef*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = inflate(&_stream, Z_SYNC_FLUSH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK: {
+        // No output space. Increase the output space by 20% (at least 10
+        // bytes). We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        size_t old_sz = output_len;
+        uint32_t output_len_delta = output_len / 5;
+        output_len += output_len_delta < 10 ? 10 : output_len_delta;
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Continue writing right after the bytes copied over.
+        _stream.next_out = (Bytef*)(output.get() + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      case Z_BUF_ERROR:
+      default:
+        inflateEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  *decompress_size = static_cast<int>(output_len - _stream.avail_out);
+  inflateEnd(&_stream);
+  return output;
+#else
+  (void)info;
+  (void)input_data;
+  (void)input_length;
+  (void)decompress_size;
+  (void)compress_format_version;
+  (void)allocator;
+  (void)windowBits;
+  return nullptr;
+#endif
+}
+
+// Compresses input[0, length) with bzip2 into *output.
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// Returns false when built without BZIP2, when length exceeds the uint32_t
+// range, when bzip2 setup fails, or when the compressed data does not fit in
+// `length` bytes. The info argument is unused.
+inline bool BZip2_Compress(const CompressionInfo& /*info*/,
+                           uint32_t compress_format_version, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef BZIP2
+  // Can't compress more than 4GB
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    return false;
+  }
+
+  // Format version 2 prefixes the block with the plain length.
+  const size_t header_len =
+      (compress_format_version == 2)
+          ? compression::PutDecompressedSizeInfo(
+                output, static_cast<uint32_t>(length))
+          : 0;
+
+  // Give the payload exactly `length` bytes. If the data expands under
+  // compression this is too small and the call fails below.
+  output->resize(header_len + length);
+
+  bz_stream strm;
+  memset(&strm, 0, sizeof(bz_stream));
+
+  // Arguments: block size 1 (100K), verbosity 0 (silent), workFactor 30
+  // (the default).
+  int rc = BZ2_bzCompressInit(&strm, 1, 0, 30);
+  if (rc != BZ_OK) {
+    return false;
+  }
+
+  // Input is the caller's buffer; output starts right after the header.
+  strm.next_in = (char*)input;
+  strm.avail_in = static_cast<unsigned int>(length);
+  strm.next_out = reinterpret_cast<char*>(&(*output)[header_len]);
+  strm.avail_out = static_cast<unsigned int>(length);
+
+  // Only BZ_STREAM_END counts as success. BZ_FINISH_OK means the output
+  // space ran out, i.e. the compressed form is larger than the input; treat
+  // that as a failed compression.
+  rc = BZ2_bzCompress(&strm, BZ_FINISH);
+  const bool finished = (rc == BZ_STREAM_END);
+  if (finished) {
+    output->resize(output->size() - strm.avail_out);
+  }
+
+  BZ2_bzCompressEnd(&strm);
+  return finished;
+#else
+  static_cast<void>(compress_format_version);
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(output);
+  return false;
+#endif
+}
+
+// Decompresses a bzip2 block into a newly allocated buffer.
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header; the output buffer starts from an estimate and grows on demand
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// On success returns the buffer and stores the number of decompressed bytes
+// in *decompress_size. Returns nullptr on any failure, and always when built
+// without BZIP2.
+inline CacheAllocationPtr BZip2_Uncompress(
+    const char* input_data, size_t input_length, int* decompress_size,
+    uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) {
+#ifdef BZIP2
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data size will be 5x the compressed size, but
+    // round to the next page size
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (char*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        done = true;
+        break;
+      case BZ_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        uint32_t old_sz = output_len;
+        // Grow with integer arithmetic, the same way Zlib_Uncompress does:
+        // 20% but never less than 10 bytes, so a tiny (or zero) buffer still
+        // makes progress.
+        uint32_t output_len_delta = output_len / 5;
+        if (output_len_delta < 10) {
+          output_len_delta = 10;
+        }
+        if (output_len >
+            std::numeric_limits<uint32_t>::max() - output_len_delta) {
+          // The buffer size cannot be represented in 32 bits any more; give
+          // up instead of wrapping around.
+          BZ2_bzDecompressEnd(&_stream);
+          return nullptr;
+        }
+        output_len += output_len_delta;
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Continue writing right after the bytes copied over.
+        _stream.next_out = (char*)(output.get() + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      default:
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  *decompress_size = static_cast<int>(output_len - _stream.avail_out);
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#else
+  (void)input_data;
+  (void)input_length;
+  (void)decompress_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// Compresses input[0, length) with LZ4 into *output.
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy (which makes the database non-portable)
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// With LZ4 r124+ a non-empty info.dict().GetRawDict() is loaded as the
+// dictionary. Returns false when built without LZ4, when length exceeds the
+// uint32_t range, or when the LZ4 call reports 0 bytes written.
+inline bool LZ4_Compress(const CompressionInfo& info,
+                         uint32_t compress_format_version, const char* input,
+                         size_t length, ::std::string* output) {
+#ifdef LZ4
+  // Can't compress more than 4GB
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    return false;
+  }
+
+  size_t header_len = 0;
+  if (compress_format_version != 2) {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness): an 8-byte header receiving the raw bytes of `length`
+    header_len = 8;
+    output->resize(header_len);
+    memcpy(&(*output)[0], &length, sizeof(length));
+  } else {
+    // new encoding, using varint32 to store size information
+    header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  }
+
+  // Make room for the worst-case compressed size after the header.
+  const int src_len = static_cast<int>(length);
+  int bound = LZ4_compressBound(src_len);
+  output->resize(static_cast<size_t>(header_len + bound));
+  char* const dst = &(*output)[header_len];
+
+  int written;
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_stream_t* lz4_stream = LZ4_createStream();
+  Slice dict = info.dict().GetRawDict();
+  if (dict.size()) {
+    LZ4_loadDict(lz4_stream, dict.data(), static_cast<int>(dict.size()));
+  }
+#if LZ4_VERSION_NUMBER >= 10700  // r129+
+  written =
+      LZ4_compress_fast_continue(lz4_stream, input, dst, src_len, bound, 1);
+#else   // up to r128
+  written = LZ4_compress_limitedOutput_continue(lz4_stream, input, dst,
+                                                src_len, bound);
+#endif
+  LZ4_freeStream(lz4_stream);
+#else   // up to r123
+  written = LZ4_compress_limitedOutput(input, dst, src_len, bound);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (written == 0) {
+    return false;
+  }
+  // Trim the buffer to header plus the bytes actually produced.
+  output->resize(static_cast<size_t>(header_len + written));
+  return true;
+#else   // LZ4
+  static_cast<void>(info);
+  static_cast<void>(compress_format_version);
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(output);
+  return false;
+#endif
+}
+
+// Decompresses an LZ4 block into a newly allocated buffer.
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy (which makes the database non-portable)
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// With LZ4 r124+ a non-empty info.dict().GetRawDict() is set as the decoding
+// dictionary. *decompress_size receives the value returned by the LZ4 decode
+// call. Returns nullptr when the header cannot be read or LZ4 reports a
+// negative size, and always when built without LZ4.
+inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info,
+                                         const char* input_data,
+                                         size_t input_length,
+                                         int* decompress_size,
+                                         uint32_t compress_format_version,
+                                         MemoryAllocator* allocator = nullptr) {
+#ifdef LZ4
+  uint32_t decoded_len = 0;
+  if (compress_format_version != 2) {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness): an 8-byte header whose leading bytes hold the size
+    if (input_length < 8) {
+      return nullptr;
+    }
+    memcpy(&decoded_len, input_data, sizeof(decoded_len));
+    input_data += 8;
+    input_length -= 8;
+  } else if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                                   &decoded_len)) {
+    // new encoding, using varint32 to store size information
+    return nullptr;
+  }
+
+  auto output = AllocateBlock(decoded_len, allocator);
+  const int src_len = static_cast<int>(input_length);
+  const int dst_capacity = static_cast<int>(decoded_len);
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamDecode_t* decoder = LZ4_createStreamDecode();
+  const Slice& dict = info.dict().GetRawDict();
+  if (dict.size()) {
+    LZ4_setStreamDecode(decoder, dict.data(), static_cast<int>(dict.size()));
+  }
+  *decompress_size = LZ4_decompress_safe_continue(
+      decoder, input_data, output.get(), src_len, dst_capacity);
+  LZ4_freeStreamDecode(decoder);
+#else   // up to r123
+  *decompress_size =
+      LZ4_decompress_safe(input_data, output.get(), src_len, dst_capacity);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (*decompress_size < 0) {
+    return nullptr;
+  }
+  assert(*decompress_size == static_cast<int>(decoded_len));
+  return output;
+#else   // LZ4
+  static_cast<void>(info);
+  static_cast<void>(input_data);
+  static_cast<void>(input_length);
+  static_cast<void>(decompress_size);
+  static_cast<void>(compress_format_version);
+  static_cast<void>(allocator);
+  return nullptr;
+#endif
+}
+
+// Compresses input[0, length) with LZ4HC into *output.
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy (which makes the database non-portable)
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// The level comes from info.options(); with LZ4 r124+ the raw dictionary from
+// info.dict() is loaded (a null pointer is passed when it is empty).
+// Returns false when built without LZ4, when length exceeds the uint32_t
+// range, or when the LZ4 call reports 0 bytes written.
+inline bool LZ4HC_Compress(const CompressionInfo& info,
+                           uint32_t compress_format_version, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef LZ4
+  // Can't compress more than 4GB
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    return false;
+  }
+
+  size_t header_len = 0;
+  if (compress_format_version != 2) {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness): an 8-byte header receiving the raw bytes of `length`
+    header_len = 8;
+    output->resize(header_len);
+    memcpy(&(*output)[0], &length, sizeof(length));
+  } else {
+    // new encoding, using varint32 to store size information
+    header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  }
+
+  // Make room for the worst-case compressed size after the header.
+  const int src_len = static_cast<int>(length);
+  int bound = LZ4_compressBound(src_len);
+  output->resize(static_cast<size_t>(header_len + bound));
+  char* const dst = &(*output)[header_len];
+
+  int written;
+  // lz4hc.h says any value < 1 will be sanitized to default
+  int level =
+      (info.options().level == CompressionOptions::kDefaultCompressionLevel)
+          ? 0
+          : info.options().level;
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamHC_t* hc_stream = LZ4_createStreamHC();
+  LZ4_resetStreamHC(hc_stream, level);
+  Slice dict = info.dict().GetRawDict();
+  const size_t dict_size = dict.size();
+  const char* dict_data = dict_size > 0 ? dict.data() : nullptr;
+  LZ4_loadDictHC(hc_stream, dict_data, static_cast<int>(dict_size));
+
+#if LZ4_VERSION_NUMBER >= 10700  // r129+
+  written = LZ4_compress_HC_continue(hc_stream, input, dst, src_len, bound);
+#else   // r124-r128
+  written = LZ4_compressHC_limitedOutput_continue(hc_stream, input, dst,
+                                                  src_len, bound);
+#endif  // LZ4_VERSION_NUMBER >= 10700
+  LZ4_freeStreamHC(hc_stream);
+
+#elif LZ4_VERSION_MAJOR  // r113-r123
+  written = LZ4_compressHC2_limitedOutput(input, dst, src_len, bound, level);
+#else   // up to r112
+  written = LZ4_compressHC_limitedOutput(input, dst, src_len, bound);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (written == 0) {
+    return false;
+  }
+  // Trim the buffer to header plus the bytes actually produced.
+  output->resize(static_cast<size_t>(header_len + written));
+  return true;
+#else   // LZ4
+  static_cast<void>(info);
+  static_cast<void>(compress_format_version);
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(output);
+  return false;
+#endif
+}
+
+// Compresses input[0, length) into *output via port::xpress::Compress when
+// built with XPRESS; otherwise ignores its arguments and returns false.
+inline bool XPRESS_Compress(const char* input, size_t length,
+                            std::string* output) {
+#ifdef XPRESS
+  return port::xpress::Compress(input, length, output);
+#else
+  static_cast<void>(input);
+  static_cast<void>(length);
+  static_cast<void>(output);
+  return false;
+#endif
+}
+
+// Decompresses via port::xpress::Decompress when built with XPRESS;
+// otherwise ignores its arguments and returns nullptr.
+inline char* XPRESS_Uncompress(const char* input_data, size_t input_length,
+                               int* decompress_size) {
+#ifdef XPRESS
+  return port::xpress::Decompress(input_data, input_length, decompress_size);
+#else
+  static_cast<void>(input_data);
+  static_cast<void>(input_length);
+  static_cast<void>(decompress_size);
+  return nullptr;
+#endif
+}
+
+// Compresses input[0, length) with ZSTD into *output. The block always starts
+// with the decompressed size as a varint32. With ZSTD v0.7.0+ a digested
+// dictionary from info.dict() is used when present; otherwise (v0.5.0+) the
+// raw dictionary and the level from info.options() are used.
+// Returns false when built without ZSTD, when length exceeds the uint32_t
+// range, or when ZSTD reports an error or produces no output.
+inline bool ZSTD_Compress(const CompressionInfo& info, const char* input,
+                          size_t length, ::std::string* output) {
+#ifdef ZSTD
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = compression::PutDecompressedSizeInfo(
+      output, static_cast<uint32_t>(length));
+
+  // Make room for the worst-case compressed size after the header.
+  size_t compressBound = ZSTD_compressBound(length);
+  output->resize(static_cast<size_t>(output_header_len + compressBound));
+  size_t outlen = 0;
+  int level;
+  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
+    // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
+    // https://github.com/facebook/zstd/issues/1148
+    level = 3;
+  } else {
+    level = info.options().level;
+  }
+#if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
+  ZSTD_CCtx* context = info.context().ZSTDPreallocCtx();
+  assert(context != nullptr);
+#if ZSTD_VERSION_NUMBER >= 700  // v0.7.0+
+  if (info.dict().GetDigestedZstdCDict() != nullptr) {
+    outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len],
+                                      compressBound, input, length,
+                                      info.dict().GetDigestedZstdCDict());
+  }
+#endif  // ZSTD_VERSION_NUMBER >= 700
+  if (outlen == 0) {
+    outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len],
+                                     compressBound, input, length,
+                                     info.dict().GetRawDict().data(),
+                                     info.dict().GetRawDict().size(), level);
+  }
+#else   // up to v0.4.x
+  outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input,
+                         length, level);
+#endif  // ZSTD_VERSION_NUMBER >= 500
+  // ZSTD reports failures as error codes encoded in the size_t result, which
+  // are very large values rather than 0. They must not be used as a length
+  // in the resize below.
+  if (outlen == 0 || ZSTD_isError(outlen)) {
+    return false;
+  }
+  output->resize(output_header_len + outlen);
+  return true;
+#else   // ZSTD
+  (void)info;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// Decompresses a ZSTD block into a newly allocated buffer. The block starts
+// with the decompressed size as a varint32. With ROCKSDB_ZSTD_DDICT a
+// digested dictionary from info.dict() is used when present; otherwise
+// (ZSTD v0.5.0+) the raw dictionary is used.
+// On success returns the buffer and stores the decompressed size in
+// *decompress_size. Returns nullptr when the header cannot be read, when ZSTD
+// reports an error, or when the decoded size differs from the size in the
+// header; always returns nullptr when built without ZSTD.
+inline CacheAllocationPtr ZSTD_Uncompress(
+    const UncompressionInfo& info, const char* input_data, size_t input_length,
+    int* decompress_size, MemoryAllocator* allocator = nullptr) {
+#ifdef ZSTD
+  uint32_t output_len = 0;
+  if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                            &output_len)) {
+    return nullptr;
+  }
+
+  auto output = AllocateBlock(output_len, allocator);
+  size_t actual_output_length = 0;
+#if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
+  ZSTD_DCtx* context = info.context().GetZSTDContext();
+  assert(context != nullptr);
+#ifdef ROCKSDB_ZSTD_DDICT
+  if (info.dict().GetDigestedZstdDDict() != nullptr) {
+    actual_output_length = ZSTD_decompress_usingDDict(
+        context, output.get(), output_len, input_data, input_length,
+        info.dict().GetDigestedZstdDDict());
+  }
+#endif  // ROCKSDB_ZSTD_DDICT
+  if (actual_output_length == 0) {
+    actual_output_length = ZSTD_decompress_usingDict(
+        context, output.get(), output_len, input_data, input_length,
+        info.dict().GetRawDict().data(), info.dict().GetRawDict().size());
+  }
+#else   // up to v0.4.x
+  (void)info;
+  actual_output_length =
+      ZSTD_decompress(output.get(), output_len, input_data, input_length);
+#endif  // ZSTD_VERSION_NUMBER >= 500
+  // ZSTD reports failures as error codes encoded in the size_t result. Check
+  // at run time rather than with an assert, so that builds with assertions
+  // disabled do not hand back a bad buffer as a success.
+  if (ZSTD_isError(actual_output_length) ||
+      actual_output_length != output_len) {
+    return nullptr;
+  }
+  *decompress_size = static_cast<int>(actual_output_length);
+  return output;
+#else   // ZSTD
+  (void)info;
+  (void)input_data;
+  (void)input_length;
+  (void)decompress_size;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// True when built with ZSTD and the ZSTD library in use reports version
+// 1.1.3 or newer.
+inline bool ZSTD_TrainDictionarySupported() {
+#ifdef ZSTD
+  // Dictionary trainer is available since v0.6.1 for static linking, but not
+  // available for dynamic linking until v1.1.3. For now we enable the feature
+  // in v1.1.3+ only.
+  const bool supported = (ZSTD_versionNumber() >= 10103);
+#else
+  const bool supported = false;
+#endif
+  return supported;
+}
+
+// Trains a ZSTD dictionary of at most max_dict_bytes bytes from `samples`,
+// a concatenation of samples whose individual lengths are listed in
+// sample_lens. Returns the trained dictionary, or an empty string when there
+// are no samples or the trainer reports an error. With ZSTD older than
+// v1.1.3 this asserts in debug builds and returns an empty string.
+inline std::string ZSTD_TrainDictionary(const std::string& samples,
+                                        const std::vector<size_t>& sample_lens,
+                                        size_t max_dict_bytes) {
+  // Dictionary trainer is available since v0.6.1 for static linking, but not
+  // available for dynamic linking until v1.1.3. For now we enable the feature
+  // in v1.1.3+ only.
+#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
+  assert(samples.empty() == sample_lens.empty());
+  if (samples.empty()) {
+    return "";
+  }
+  // Train into a buffer of the maximum size, then trim it to what the
+  // trainer actually produced.
+  std::string trained(max_dict_bytes, '\0');
+  const size_t trained_len = ZDICT_trainFromBuffer(
+      &trained[0], max_dict_bytes, samples.data(), sample_lens.data(),
+      static_cast<unsigned>(sample_lens.size()));
+  if (ZDICT_isError(trained_len)) {
+    return "";
+  }
+  assert(trained_len <= max_dict_bytes);
+  trained.resize(trained_len);
+  return trained;
+#else   // up to v1.1.2
+  assert(false);
+  static_cast<void>(samples);
+  static_cast<void>(sample_lens);
+  static_cast<void>(max_dict_bytes);
+  return "";
+#endif  // ZSTD_VERSION_NUMBER >= 10103
+}
+
+// Convenience overload for equally sized samples: `samples` is treated as a
+// run of samples of (1 << sample_len_shift) bytes each, and the vector-based
+// overload is invoked with those lengths. With ZSTD older than v1.1.3 this
+// asserts in debug builds and returns an empty string.
+inline std::string ZSTD_TrainDictionary(const std::string& samples,
+                                        size_t sample_len_shift,
+                                        size_t max_dict_bytes) {
+  // Gated on the same ZSTD version (v1.1.3+) as the vector-based overload
+  // this forwards to.
+#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
+  const size_t each_len = size_t(1) << sample_len_shift;
+  // skips potential partial sample at the end of "samples"
+  const size_t sample_count = samples.size() >> sample_len_shift;
+  std::vector<size_t> lens(sample_count, each_len);
+  return ZSTD_TrainDictionary(samples, lens, max_dict_bytes);
+#else   // up to v1.1.2
+  assert(false);
+  static_cast<void>(samples);
+  static_cast<void>(sample_len_shift);
+  static_cast<void>(max_dict_bytes);
+  return "";
+#endif  // ZSTD_VERSION_NUMBER >= 10103
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/compression_context_cache.cc b/src/rocksdb/util/compression_context_cache.cc
new file mode 100644
index 000000000..a06cfb279
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#include "util/compression_context_cache.h"
+
+#include "util/compression.h"
+#include "util/core_local.h"
+
+#include <atomic>
+
+namespace ROCKSDB_NAMESPACE {
+namespace compression_cache {
+
+// Value held by ZSTDCachedData::zstd_uncomp_sentinel_ while the cached
+// uncompression data of that entry is borrowed.
+void* const SentinelValue = nullptr;
+// Cache ZSTD uncompression contexts for reads.
+// If needed, we can add ZSTD compression context caching,
+// which is currently not done since BlockBasedTableBuilder
+// simply creates one compression context per new SST file.
+// One cache slot: an uncompression context plus an atomic marker telling
+// whether the context is currently free or borrowed.
+struct ZSTDCachedData {
+  // The context object itself is stored here instead of a pointer to it, to
+  // a) avoid leaking native types and b) keep cache use transparent for the
+  // user.
+  ZSTDUncompressCachedData uncomp_cached_data_;
+  // Holds &uncomp_cached_data_ while the slot is free and SentinelValue while
+  // it is borrowed.
+  std::atomic<void*> zstd_uncomp_sentinel_;
+
+  char padding[(CACHE_LINE_SIZE -
+                (sizeof(ZSTDUncompressCachedData) +
+                 sizeof(std::atomic<void*>)) %
+                    CACHE_LINE_SIZE)];  // unused padding field
+
+  ZSTDCachedData() : zstd_uncomp_sentinel_(&uncomp_cached_data_) {}
+  ZSTDCachedData(const ZSTDCachedData&) = delete;
+  ZSTDCachedData& operator=(const ZSTDCachedData&) = delete;
+
+  // Tries to borrow the cached context. On success the result is initialized
+  // from the cached data together with "idx"; otherwise the result gets its
+  // own one-time-use data.
+  ZSTDUncompressCachedData GetUncompressData(int64_t idx) {
+    ZSTDUncompressCachedData result;
+    void* free_marker = &uncomp_cached_data_;
+    const bool borrowed = zstd_uncomp_sentinel_.compare_exchange_strong(
+        free_marker, SentinelValue);
+    if (!borrowed) {
+      // Slot is in use elsewhere: creates one time use data
+      result.CreateIfNeeded();
+      return result;
+    }
+    uncomp_cached_data_.CreateIfNeeded();
+    result.InitFromCache(uncomp_cached_data_, idx);
+    return result;
+  }
+
+  // Puts the entry back into circulation. Must only be called after the
+  // entry was successfully obtained through GetUncompressData().
+  void ReturnUncompressData() {
+    void* previous = zstd_uncomp_sentinel_.exchange(&uncomp_cached_data_);
+    if (previous != SentinelValue) {
+      // Means we are returning while not having it acquired.
+      assert(false);
+    }
+  }
+};
+static_assert(sizeof(ZSTDCachedData) % CACHE_LINE_SIZE == 0,
+              "Expected CACHE_LINE_SIZE alignment");
+} // namespace compression_cache
+
+using namespace compression_cache;
+
+// Implementation object behind CompressionContextCache: one ZSTDCachedData
+// slot per entry of a CoreLocalArray.
+class CompressionContextCache::Rep {
+ public:
+  Rep() {}
+
+  // Picks the slot for the current core and asks it for uncompression data,
+  // passing the slot index along.
+  ZSTDUncompressCachedData GetZSTDUncompressData() {
+    auto slot_and_index = per_core_uncompr_.AccessElementAndIndex();
+    ZSTDCachedData* slot = slot_and_index.first;
+    return slot->GetUncompressData(
+        static_cast<int64_t>(slot_and_index.second));
+  }
+
+  // Hands the data borrowed from slot "idx" back to that slot.
+  void ReturnZSTDUncompressData(int64_t idx) {
+    assert(idx >= 0);
+    const size_t slot_index = static_cast<size_t>(idx);
+    per_core_uncompr_.AccessAtCore(slot_index)->ReturnUncompressData();
+  }
+
+ private:
+  CoreLocalArray<ZSTDCachedData> per_core_uncompr_;
+};
+
+// Allocates the implementation object; it is released in the destructor.
+CompressionContextCache::CompressionContextCache() : rep_(new Rep()) {}
+
+// Returns the address of the function-local static instance, which is
+// constructed on the first call.
+CompressionContextCache* CompressionContextCache::Instance() {
+ static CompressionContextCache instance;
+ return &instance;
+}
+
+// Calls Instance() and discards the result, which forces the singleton to be
+// constructed if it has not been yet.
+void CompressionContextCache::InitSingleton() { Instance(); }
+
+// Forwards to Rep::GetZSTDUncompressData().
+ZSTDUncompressCachedData
+CompressionContextCache::GetCachedZSTDUncompressData() {
+ return rep_->GetZSTDUncompressData();
+}
+
+// Forwards to Rep::ReturnZSTDUncompressData(); "idx" must be non-negative
+// (asserted there).
+void CompressionContextCache::ReturnCachedZSTDUncompressData(int64_t idx) {
+ rep_->ReturnZSTDUncompressData(idx);
+}
+
+// Releases the implementation object allocated by the constructor.
+CompressionContextCache::~CompressionContextCache() { delete rep_; }
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/compression_context_cache.h b/src/rocksdb/util/compression_context_cache.h
new file mode 100644
index 000000000..7b7b2d507
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+// Compression context cache allows caching compression/uncompression contexts.
+// This helps with random read latencies and reduces CPU utilization.
+// Caching is implemented using CoreLocal facility. Compression/Uncompression
+// instances are cached on a per core basis using CoreLocalArray. A borrowed
+// instance is atomically replaced with a sentinel value for the time it is
+// being used. If it turns out that another thread is already using the
+// instance, we still create one on the heap, which is destroyed later.
+
+#pragma once
+
+#include <stdint.h>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ZSTDUncompressCachedData;
+
+// Process-wide cache of ZSTD uncompression data. It can only be reached
+// through Instance(): the constructor and destructor are private and copying
+// is deleted.
+class CompressionContextCache {
+ public:
+ // Singleton
+ static CompressionContextCache* Instance();
+ // Makes sure the singleton exists; the instance itself is not returned.
+ static void InitSingleton();
+ CompressionContextCache(const CompressionContextCache&) = delete;
+ CompressionContextCache& operator=(const CompressionContextCache&) = delete;
+
+ // Obtains ZSTD uncompression data from the cache.
+ ZSTDUncompressCachedData GetCachedZSTDUncompressData();
+ // Returns previously obtained data to the cache entry identified by idx.
+ void ReturnCachedZSTDUncompressData(int64_t idx);
+
+ private:
+ // Singleton
+ CompressionContextCache();
+ ~CompressionContextCache();
+
+ // Implementation is hidden behind an opaque pointer.
+ class Rep;
+ Rep* rep_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.cc b/src/rocksdb/util/concurrent_task_limiter_impl.cc
new file mode 100644
index 000000000..2342677d8
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/concurrent_task_limiter_impl.h"
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Creates a limiter called "name" that starts with no outstanding tasks and
+// the given cap on outstanding tasks.
+ConcurrentTaskLimiterImpl::ConcurrentTaskLimiterImpl(
+    const std::string& name, int32_t max_outstanding_task)
+    : name_(name),
+      max_outstanding_tasks_(max_outstanding_task),
+      outstanding_tasks_(0) {}
+
+// Every token must have been released before the limiter is destroyed.
+ConcurrentTaskLimiterImpl::~ConcurrentTaskLimiterImpl() {
+ assert(outstanding_tasks_ == 0);
+}
+
+// Returns the name given at construction.
+const std::string& ConcurrentTaskLimiterImpl::GetName() const {
+ return name_;
+}
+
+// Replaces the cap on outstanding tasks (relaxed store). GetToken() treats a
+// negative cap as unlimited.
+void ConcurrentTaskLimiterImpl::SetMaxOutstandingTask(int32_t limit) {
+ max_outstanding_tasks_.store(limit, std::memory_order_relaxed);
+}
+
+// Sets the cap to -1, which GetToken() treats as unlimited.
+void ConcurrentTaskLimiterImpl::ResetMaxOutstandingTask() {
+ max_outstanding_tasks_.store(-1, std::memory_order_relaxed);
+}
+
+// Returns the current number of outstanding tasks (relaxed load).
+int32_t ConcurrentTaskLimiterImpl::GetOutstandingTask() const {
+ return outstanding_tasks_.load(std::memory_order_relaxed);
+}
+
+// Tries to reserve a slot for one more task. Returns a token on success, or
+// nullptr when throttled. The cap is read once up front; "force" or a
+// negative cap bypasses the throttle.
+std::unique_ptr<TaskLimiterToken> ConcurrentTaskLimiterImpl::GetToken(
+    bool force) {
+  const int32_t cap = max_outstanding_tasks_.load(std::memory_order_relaxed);
+  int32_t observed = outstanding_tasks_.load(std::memory_order_relaxed);
+  // force = true, bypass the throttle.
+  // cap < 0 means unlimited tasks.
+  const bool unthrottled = force || cap < 0;
+  for (;;) {
+    if (!unthrottled && observed >= cap) {
+      return nullptr;
+    }
+    // On failure "observed" is refreshed with the current count and the
+    // throttle check above is repeated.
+    if (outstanding_tasks_.compare_exchange_weak(observed, observed + 1)) {
+      return std::unique_ptr<TaskLimiterToken>(new TaskLimiterToken(this));
+    }
+  }
+}
+
+// Factory: returns a heap-allocated ConcurrentTaskLimiterImpl with the given
+// name and limit.
+ConcurrentTaskLimiter* NewConcurrentTaskLimiter(
+ const std::string& name, int32_t limit) {
+ return new ConcurrentTaskLimiterImpl(name, limit);
+}
+
+// Releases the slot held by this token by decrementing the limiter's
+// outstanding task count.
+TaskLimiterToken::~TaskLimiterToken() {
+  // Check the value produced by this decrement rather than re-reading the
+  // shared counter: a second load could observe other threads' updates and
+  // would touch the limiter again after the slot has been released.
+  const int32_t remaining = --limiter_->outstanding_tasks_;
+  assert(remaining >= 0);
+  (void)remaining;  // only read by the assert
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.h b/src/rocksdb/util/concurrent_task_limiter_impl.h
new file mode 100644
index 000000000..d8c1e03cb
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/env.h"
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TaskLimiterToken;
+
+// ConcurrentTaskLimiter implementation backed by two atomic counters: the
+// cap on outstanding tasks and the number of tasks currently outstanding.
+class ConcurrentTaskLimiterImpl : public ConcurrentTaskLimiter {
+ public:
+ explicit ConcurrentTaskLimiterImpl(const std::string& name,
+ int32_t max_outstanding_task);
+ // No copying allowed
+ ConcurrentTaskLimiterImpl(const ConcurrentTaskLimiterImpl&) = delete;
+ ConcurrentTaskLimiterImpl& operator=(const ConcurrentTaskLimiterImpl&) =
+ delete;
+
+ virtual ~ConcurrentTaskLimiterImpl();
+
+ virtual const std::string& GetName() const override;
+
+ virtual void SetMaxOutstandingTask(int32_t limit) override;
+
+ virtual void ResetMaxOutstandingTask() override;
+
+ virtual int32_t GetOutstandingTask() const override;
+
+ // Request token for adding a new task.
+ // If force == true, it requests a token bypassing throttle.
+ // Returns nullptr if it got throttled.
+ virtual std::unique_ptr<TaskLimiterToken> GetToken(bool force);
+
+ private:
+ // TaskLimiterToken decrements outstanding_tasks_ when it is destroyed.
+ friend class TaskLimiterToken;
+
+ std::string name_;
+ std::atomic<int32_t> max_outstanding_tasks_;
+ std::atomic<int32_t> outstanding_tasks_;
+};
+
+// Handle for one task slot of a ConcurrentTaskLimiterImpl. The token only
+// stores the limiter pointer; its destructor is declared here and defined in
+// the .cc file.
+class TaskLimiterToken {
+ public:
+ explicit TaskLimiterToken(ConcurrentTaskLimiterImpl* limiter)
+ : limiter_(limiter) {}
+ ~TaskLimiterToken();
+
+ private:
+ // Limiter this token belongs to (not owned by the token).
+ ConcurrentTaskLimiterImpl* limiter_;
+
+ // no copying allowed
+ TaskLimiterToken(const TaskLimiterToken&) = delete;
+ void operator=(const TaskLimiterToken&) = delete;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h
new file mode 100644
index 000000000..b444a1152
--- /dev/null
+++ b/src/rocksdb/util/core_local.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An array of core-local values. Ideally the value type, T, is cache aligned to
+// prevent false sharing.
+template <typename T>
+class CoreLocalArray {
+ public:
+ // Allocates a power-of-two number of default-constructed elements.
+ CoreLocalArray();
+
+ // number of elements in the array
+ size_t Size() const;
+ // returns pointer to the element corresponding to the core that the thread
+ // currently runs on.
+ T* Access() const;
+ // same as above, but also returns the core index, which the client can cache
+ // to reduce how often core ID needs to be retrieved. Only do this if some
+ // inaccuracy is tolerable, as the thread may migrate to a different core.
+ std::pair<T*, size_t> AccessElementAndIndex() const;
+ // returns pointer to element for the specified core index. This can be used,
+ // e.g., for aggregation, or if the client caches core index.
+ T* AccessAtCore(size_t core_idx) const;
+
+ private:
+ std::unique_ptr<T[]> data_;
+ // log2 of the number of elements in data_
+ int size_shift_;
+};
+
+// Sizes the array to the smallest power of two that is at least 8 and at
+// least the reported hardware concurrency.
+template <typename T>
+CoreLocalArray<T>::CoreLocalArray() {
+  const int cpu_count = static_cast<int>(std::thread::hardware_concurrency());
+  // smallest shift >= 3 such that (1 << shift) >= cpu_count
+  int shift = 3;
+  while ((1 << shift) < cpu_count) {
+    ++shift;
+  }
+  size_shift_ = shift;
+  data_.reset(new T[static_cast<size_t>(1) << size_shift_]);
+}
+
+// Number of elements, i.e. 2^size_shift_.
+template <typename T>
+size_t CoreLocalArray<T>::Size() const {
+  const size_t one = 1;
+  return one << size_shift_;
+}
+
+// Same element AccessElementAndIndex() selects, without the index.
+template <typename T>
+T* CoreLocalArray<T>::Access() const {
+ return AccessElementAndIndex().first;
+}
+
+// Maps the id reported by port::PhysicalCoreID() onto an array slot by
+// masking it to the array size. A negative id selects a random slot instead.
+// Returns the slot pointer together with its index.
+template <typename T>
+std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
+  const int cpuid = port::PhysicalCoreID();
+  const size_t core_idx =
+      UNLIKELY(cpuid < 0)
+          // cpu id unavailable, just pick randomly
+          ? Random::GetTLSInstance()->Uniform(1 << size_shift_)
+          : static_cast<size_t>(cpuid & ((1 << size_shift_) - 1));
+  return {AccessAtCore(core_idx), core_idx};
+}
+
+// Returns the address of element core_idx; core_idx must be smaller than the
+// array size (asserted).
+template <typename T>
+T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const {
+ assert(core_idx < static_cast<size_t>(1) << size_shift_);
+ return &data_[core_idx];
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/crc32c.cc b/src/rocksdb/util/crc32c.cc
new file mode 100644
index 000000000..f0eee2f4f
--- /dev/null
+++ b/src/rocksdb/util/crc32c.cc
@@ -0,0 +1,1263 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+#include "util/crc32c.h"
+#include <stdint.h>
+#ifdef HAVE_SSE42
+#include <nmmintrin.h>
+#include <wmmintrin.h>
+#endif
+#include "util/coding.h"
+#include "util/util.h"
+
+#include "util/crc32c_arm64.h"
+
+#ifdef __powerpc64__
+#include "util/crc32c_ppc.h"
+#include "util/crc32c_ppc_constants.h"
+
+#if __linux__
+#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+#include <sys/auxv.h>
+#endif
+
+#ifndef PPC_FEATURE2_VEC_CRYPTO
+#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
+#endif
+
+#ifndef AT_HWCAP2
+#define AT_HWCAP2 26
+#endif
+
+#endif /* __linux__ */
+
+#endif
+
+#if defined(__linux__) && defined(HAVE_ARM64_CRC)
+// Starts out false; IsFastCrc32Supported() assigns it the result of
+// crc32c_pmull_runtime_check() when crc32c_runtime_check() succeeds.
+bool pmull_runtime_flag = false;
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+#ifdef __powerpc64__
+// Result of the last arch_ppc_probe() call: 1 when the probe found
+// PPC_FEATURE2_VEC_CRYPTO in AT_HWCAP2, otherwise 0.
+static int arch_ppc_crc32 = 0;
+#endif /* __powerpc64__ */
+#endif
+
+static const uint32_t table0_[256] = {
+ 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+ 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+ 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+ 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+ 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+ 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+ 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+ 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+ 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+ 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+ 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+ 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+ 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+ 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+ 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+ 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+ 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+ 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+ 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+ 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+ 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+ 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+ 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+ 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+ 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+ 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+ 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+ 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+ 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+ 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+ 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+ 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+ 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+ 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+ 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+ 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+ 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+ 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+ 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+ 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+ 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+ 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+ 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+ 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+ 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+ 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+ 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+ 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+ 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+ 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+ 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+ 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+ 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+ 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+ 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+ 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+ 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+ 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+ 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+ 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+ 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+ 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+ 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+ 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+};
+static const uint32_t table1_[256] = {
+ 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
+ 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+ 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+ 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+ 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
+ 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+ 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
+ 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+ 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+ 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+ 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
+ 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+ 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
+ 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+ 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+ 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+ 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
+ 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+ 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
+ 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+ 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+ 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+ 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
+ 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+ 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
+ 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+ 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+ 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+ 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
+ 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+ 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
+ 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+ 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+ 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+ 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
+ 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+ 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
+ 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+ 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+ 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+ 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
+ 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+ 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
+ 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+ 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+ 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+ 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
+ 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+ 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
+ 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+ 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+ 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+ 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
+ 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+ 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
+ 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+ 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+ 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+ 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
+ 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+ 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
+ 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+ 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+ 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+};
+static const uint32_t table2_[256] = {
+ 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
+ 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+ 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+ 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+ 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
+ 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+ 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
+ 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+ 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+ 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+ 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
+ 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+ 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
+ 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+ 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+ 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+ 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
+ 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+ 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
+ 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+ 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+ 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+ 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
+ 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+ 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
+ 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+ 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+ 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+ 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
+ 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+ 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
+ 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+ 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+ 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+ 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
+ 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+ 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
+ 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+ 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+ 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+ 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
+ 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+ 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
+ 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+ 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+ 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+ 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
+ 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+ 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
+ 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+ 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+ 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+ 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
+ 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+ 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
+ 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+ 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+ 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+ 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
+ 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+ 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
+ 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+ 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+ 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+};
+static const uint32_t table3_[256] = {
+ 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
+ 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+ 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+ 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+ 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
+ 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+ 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
+ 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+ 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+ 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+ 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
+ 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+ 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
+ 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+ 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+ 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+ 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
+ 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+ 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
+ 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+ 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+ 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+ 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
+ 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+ 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
+ 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+ 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+ 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+ 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
+ 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+ 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
+ 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+ 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+ 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+ 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
+ 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+ 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
+ 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+ 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+ 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+ 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
+ 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+ 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
+ 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+ 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+ 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+ 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
+ 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+ 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
+ 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+ 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+ 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+ 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
+ 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+ 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
+ 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+ 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+ 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+ 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
+ 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+ 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
+ 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+ 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+ 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+};
+
+// Reads a naturally-aligned 32-bit word stored in little endian byte-order
+static inline uint32_t LE_LOAD32(const uint8_t *p) {
+  const char* raw = reinterpret_cast<const char*>(p);
+  return DecodeFixed32(raw);
+}
+
+#if defined(HAVE_SSE42) && (defined(__LP64__) || defined(_WIN64))
+// 64-bit counterpart of LE_LOAD32; only compiled for 64-bit SSE4.2 builds
+static inline uint64_t LE_LOAD64(const uint8_t *p) {
+  const char* raw = reinterpret_cast<const char*>(p);
+  return DecodeFixed64(raw);
+}
+#endif
+
+// Table-driven CRC step: folds the next 8 input bytes at *p into *l and
+// advances *p by 8. The work is done as two identical 4-byte rounds.
+static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
+  for (int round = 0; round < 2; ++round) {
+    const uint32_t word = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+    *p += 4;
+    // One table lookup per byte of the word, lowest byte through table3_
+    *l = table3_[word & 0xff] ^
+         table2_[(word >> 8) & 0xff] ^
+         table1_[(word >> 16) & 0xff] ^
+         table0_[word >> 24];
+  }
+}
+
+// Folds the next 8 input bytes at *p into *l and advances *p by 8.
+// Without HAVE_SSE42 this falls back to Slow_CRC32; otherwise it uses the
+// _mm_crc32_u64 intrinsic on 64-bit builds, or two _mm_crc32_u32 calls.
+static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
+#ifndef HAVE_SSE42
+ Slow_CRC32(l, p);
+#elif defined(__LP64__) || defined(_WIN64)
+ *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
+ *p += 8;
+#else
+ *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+ *p += 4;
+ *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+ *p += 4;
+#endif
+}
+
+// Extends "crc" over the "size" bytes starting at "buf" and returns the new
+// value. The template argument CRC32 is the 8-bytes-per-call step function
+// (Slow_CRC32 and Fast_CRC32 above both have this shape). The running value
+// is inverted on entry and again on return.
+template<void (*CRC32)(uint64_t*, uint8_t const**)>
+uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
+
+ const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+ const uint8_t *e = p + size;
+ uint64_t l = crc ^ 0xffffffffu;
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+// Fold a single input byte into l using table0_ and advance p by one
+#define STEP1 do { \
+ int c = (l & 0xff) ^ *p++; \
+ l = table0_[c] ^ (l >> 8); \
+} while (0)
+
+
+ // Point x at first 16-byte aligned byte in string. This might be
+ // just past the end of the string.
+ const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+ const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+ if (x <= e) {
+ // Process bytes until finished or p is 16-byte aligned
+ while (p != x) {
+ STEP1;
+ }
+ }
+ // Process bytes 16 at a time
+ while ((e-p) >= 16) {
+ CRC32(&l, &p);
+ CRC32(&l, &p);
+ }
+ // Process bytes 8 at a time
+ while ((e-p) >= 8) {
+ CRC32(&l, &p);
+ }
+ // Process the last few bytes
+ while (p != e) {
+ STEP1;
+ }
+#undef STEP1
+#undef ALIGN
+ return static_cast<uint32_t>(l ^ 0xffffffffu);
+}
+
+// Detect if ARM64 CRC or not.
+#ifndef HAVE_ARM64_CRC
+// Detect if SSE42 or not.
+#ifndef HAVE_POWER8
+
+// Runtime check for SSE4.2: queries CPUID leaf 1 and tests bit 20 of ECX.
+// Returns false when HAVE_SSE42 is not defined or when neither the
+// GCC/x86_64 nor the _WIN64 code path applies.
+static bool isSSE42() {
+#ifndef HAVE_SSE42
+ return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+ uint32_t c_;
+ __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+ return c_ & (1U << 20); // copied from CpuId.h in Folly. Test SSE42
+#elif defined(_WIN64)
+ int info[4];
+ __cpuidex(info, 0x00000001, 0);
+ return (info[2] & ((int)1 << 20)) != 0;
+#else
+ return false;
+#endif
+}
+
+// Runtime check for PCLMULQDQ: queries CPUID leaf 1 and tests bit 1 of ECX.
+// Returns false when HAVE_SSE42 is not defined or when neither the
+// GCC/x86_64 nor the _WIN64 code path applies.
+static bool isPCLMULQDQ() {
+#ifndef HAVE_SSE42
+// in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ are
+// supported by compiler
+ return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+ uint32_t c_;
+ __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+ return c_ & (1U << 1); // PCLMULQDQ is in bit 1 (not bit 0)
+#elif defined(_WIN64)
+ int info[4];
+ __cpuidex(info, 0x00000001, 0);
+ return (info[2] & ((int)1 << 1)) != 0;
+#else
+ return false;
+#endif
+}
+
+#endif // HAVE_POWER8
+#endif // HAVE_ARM64_CRC
+
+// Signature shared by the Extend*Impl functions in this file:
+// (crc so far, buffer, buffer size) -> extended crc.
+typedef uint32_t (*Function)(uint32_t, const char*, size_t);
+
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+// Adapter with the Function signature around crc32c_ppc(), which takes the
+// buffer as unsigned char.
+uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) {
+  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(buf);
+  return crc32c_ppc(crc, bytes, size);
+}
+
+#if __linux__
+// Resets arch_ppc_crc32 to 0, then sets it to 1 when getauxval(AT_HWCAP2)
+// reports PPC_FEATURE2_VEC_CRYPTO (only checked when getauxval is available
+// on a __powerpc64__ build). Returns the resulting flag.
+static int arch_ppc_probe(void) {
+ arch_ppc_crc32 = 0;
+
+#if defined(__powerpc64__) && defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+ if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1;
+#endif /* __powerpc64__ */
+
+ return arch_ppc_crc32;
+}
+#endif // __linux__
+
+// Boolean view of arch_ppc_probe(): true when the probe returns non-zero.
+static bool isAltiVec() {
+  return arch_ppc_probe() != 0;
+}
+#endif
+
+#if defined(__linux__) && defined(HAVE_ARM64_CRC)
+// Adapter with the Function signature around crc32c_arm64(), which takes the
+// buffer as unsigned char.
+uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) {
+  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(buf);
+  return crc32c_arm64(crc, bytes, size);
+}
+#endif
+
+// Reports whether an accelerated CRC32C path is available. Returns
+// "Supported on <arch>" or "Not supported on <arch>", where <arch> is "PPC",
+// "Arm64" or "x86" depending on the build configuration.
+// On Arm64 with a working CRC unit this also refreshes pmull_runtime_flag.
+std::string IsFastCrc32Supported() {
+  bool has_fast_crc = false;
+  std::string arch;
+#ifdef HAVE_POWER8
+  // Name the architecture regardless of the probe result so the
+  // "Not supported" message never ends with an empty architecture.
+  arch = "PPC";
+#ifdef HAS_ALTIVEC
+  has_fast_crc = (arch_ppc_probe() != 0);
+#endif
+#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
+  arch = "Arm64";
+  if (crc32c_runtime_check()) {
+    has_fast_crc = true;
+    pmull_runtime_flag = crc32c_pmull_runtime_check();
+  }
+#else
+  has_fast_crc = isSSE42();
+  arch = "x86";
+#endif
+  return (has_fast_crc ? "Supported on " : "Not supported on ") + arch;
+}
+
+
+/*
+ * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the author be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ * Ferry Toth
+ * ftoth@exalondelft.nl
+ *
+ * https://github.com/htot/crc32c
+ *
+ * Modified by Facebook
+ *
+ * Original intel whitepaper:
+ * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
+ * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ *
+ * This version is from the folly library, created by Dave Watson <davejwatson@fb.com>
+ *
+*/
+#if defined HAVE_SSE42 && defined HAVE_PCLMUL
+
+#define CRCtriplet(crc, buf, offset) \
+ crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
+ crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \
+ crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset));
+
+#define CRCduplet(crc, buf, offset) \
+ crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
+ crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset));
+
+#define CRCsinglet(crc, buf, offset) \
+ crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset));
+
+
+// Numbers taken directly from intel whitepaper.
+// clang-format off
+const uint64_t clmul_constants[] = {
+ 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
+ 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
+ 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
+ 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
+ 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
+ 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
+ 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
+ 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
+ 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
+ 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
+ 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
+ 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
+ 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
+ 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
+ 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
+ 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
+ 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
+ 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
+ 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
+ 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
+ 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
+ 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
+ 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
+ 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
+ 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
+ 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
+ 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
+ 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
+ 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
+ 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
+ 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
+ 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
+ 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
+ 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
+ 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
+ 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
+ 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
+ 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
+ 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
+ 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
+ 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
+ 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
+ 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
+ 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
+ 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
+ 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
+ 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
+ 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
+ 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
+ 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
+ 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
+ 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
+ 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
+ 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
+ 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
+ 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
+ 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
+ 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
+ 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
+ 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
+ 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
+ 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
+ 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
+ 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
+};
+
+// Folds the (len & 7) bytes at next into crc0 using 4-, 2- and 1-byte CRC steps.
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+inline void align_to_8(
+ size_t len,
+ uint64_t& crc0, // crc so far, updated on return
+ const unsigned char*& next) { // next data pointer, updated on return
+ uint32_t crc32bit = static_cast<uint32_t>(crc0); // only the low 32 bits of crc0 are used
+ if (len & 0x04) {
+ crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next);
+ next += sizeof(uint32_t);
+ }
+ if (len & 0x02) {
+ crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next);
+ next += sizeof(uint16_t);
+ }
+ if (len & 0x01) {
+ crc32bit = _mm_crc32_u8(crc32bit, *(next));
+ next++;
+ }
+ crc0 = crc32bit; // zero-extended back into the 64-bit accumulator
+}
+
+//
+// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well
+// chosen constant and xor's these with the remaining CRC.
+//
+inline uint64_t CombineCRC(
+ size_t block_size,
+ uint64_t crc0,
+ uint64_t crc1,
+ uint64_t crc2,
+ const uint64_t* next2) {
+ const auto multiplier =
+ *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1);
+ const auto crc0_xmm = _mm_set_epi64x(0, crc0);
+ const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00);
+ const auto crc1_xmm = _mm_set_epi64x(0, crc1);
+ const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10);
+ const auto res = _mm_xor_si128(res0, res1);
+ crc0 = _mm_cvtsi128_si64(res);
+ crc0 = crc0 ^ *((uint64_t*)next2 - 1);
+ crc2 = _mm_crc32_u64(crc2, crc0);
+ return crc2;
+}
+
+// Compute CRC-32C using the Intel hardware instruction.
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) {
+ const unsigned char* next = (const unsigned char*)buf;
+ uint64_t count;
+ uint64_t crc0, crc1, crc2;
+ crc0 = crc ^ 0xffffffffu;
+
+
+ if (len >= 8) {
+ // if len > 216 then align and use triplets
+ if (len > 216) {
+ {
+ // Work on the bytes (< 8) before the first 8-byte alignment addr starts
+ uint64_t align_bytes = (8 - (uintptr_t)next) & 7;
+ len -= align_bytes;
+ align_to_8(align_bytes, crc0, next);
+ }
+
+ // Now work on the remaining blocks
+ count = len / 24; // number of triplets
+ len %= 24; // bytes remaining
+ uint64_t n = count >> 7; // #blocks = first block + full blocks
+ uint64_t block_size = count & 127;
+ if (block_size == 0) {
+ block_size = 128;
+ } else {
+ n++;
+ }
+ // points to the first byte of the next block
+ const uint64_t* next0 = (uint64_t*)next + block_size;
+ const uint64_t* next1 = next0 + block_size;
+ const uint64_t* next2 = next1 + block_size;
+
+ crc1 = crc2 = 0;
+ // Use Duff's device, a do/while loop inside a switch()
+ // statement. This needs to execute at least once, round len
+ // down to nearest triplet multiple
+ switch (block_size) {
+ case 128:
+ do {
+ // jumps here for a full block of len 128
+ CRCtriplet(crc, next, -128);
+ FALLTHROUGH_INTENDED;
+ case 127:
+ // jumps here or below for the first block smaller
+ CRCtriplet(crc, next, -127);
+ FALLTHROUGH_INTENDED;
+ case 126:
+ CRCtriplet(crc, next, -126); // than 128
+ FALLTHROUGH_INTENDED;
+ case 125:
+ CRCtriplet(crc, next, -125);
+ FALLTHROUGH_INTENDED;
+ case 124:
+ CRCtriplet(crc, next, -124);
+ FALLTHROUGH_INTENDED;
+ case 123:
+ CRCtriplet(crc, next, -123);
+ FALLTHROUGH_INTENDED;
+ case 122:
+ CRCtriplet(crc, next, -122);
+ FALLTHROUGH_INTENDED;
+ case 121:
+ CRCtriplet(crc, next, -121);
+ FALLTHROUGH_INTENDED;
+ case 120:
+ CRCtriplet(crc, next, -120);
+ FALLTHROUGH_INTENDED;
+ case 119:
+ CRCtriplet(crc, next, -119);
+ FALLTHROUGH_INTENDED;
+ case 118:
+ CRCtriplet(crc, next, -118);
+ FALLTHROUGH_INTENDED;
+ case 117:
+ CRCtriplet(crc, next, -117);
+ FALLTHROUGH_INTENDED;
+ case 116:
+ CRCtriplet(crc, next, -116);
+ FALLTHROUGH_INTENDED;
+ case 115:
+ CRCtriplet(crc, next, -115);
+ FALLTHROUGH_INTENDED;
+ case 114:
+ CRCtriplet(crc, next, -114);
+ FALLTHROUGH_INTENDED;
+ case 113:
+ CRCtriplet(crc, next, -113);
+ FALLTHROUGH_INTENDED;
+ case 112:
+ CRCtriplet(crc, next, -112);
+ FALLTHROUGH_INTENDED;
+ case 111:
+ CRCtriplet(crc, next, -111);
+ FALLTHROUGH_INTENDED;
+ case 110:
+ CRCtriplet(crc, next, -110);
+ FALLTHROUGH_INTENDED;
+ case 109:
+ CRCtriplet(crc, next, -109);
+ FALLTHROUGH_INTENDED;
+ case 108:
+ CRCtriplet(crc, next, -108);
+ FALLTHROUGH_INTENDED;
+ case 107:
+ CRCtriplet(crc, next, -107);
+ FALLTHROUGH_INTENDED;
+ case 106:
+ CRCtriplet(crc, next, -106);
+ FALLTHROUGH_INTENDED;
+ case 105:
+ CRCtriplet(crc, next, -105);
+ FALLTHROUGH_INTENDED;
+ case 104:
+ CRCtriplet(crc, next, -104);
+ FALLTHROUGH_INTENDED;
+ case 103:
+ CRCtriplet(crc, next, -103);
+ FALLTHROUGH_INTENDED;
+ case 102:
+ CRCtriplet(crc, next, -102);
+ FALLTHROUGH_INTENDED;
+ case 101:
+ CRCtriplet(crc, next, -101);
+ FALLTHROUGH_INTENDED;
+ case 100:
+ CRCtriplet(crc, next, -100);
+ FALLTHROUGH_INTENDED;
+ case 99:
+ CRCtriplet(crc, next, -99);
+ FALLTHROUGH_INTENDED;
+ case 98:
+ CRCtriplet(crc, next, -98);
+ FALLTHROUGH_INTENDED;
+ case 97:
+ CRCtriplet(crc, next, -97);
+ FALLTHROUGH_INTENDED;
+ case 96:
+ CRCtriplet(crc, next, -96);
+ FALLTHROUGH_INTENDED;
+ case 95:
+ CRCtriplet(crc, next, -95);
+ FALLTHROUGH_INTENDED;
+ case 94:
+ CRCtriplet(crc, next, -94);
+ FALLTHROUGH_INTENDED;
+ case 93:
+ CRCtriplet(crc, next, -93);
+ FALLTHROUGH_INTENDED;
+ case 92:
+ CRCtriplet(crc, next, -92);
+ FALLTHROUGH_INTENDED;
+ case 91:
+ CRCtriplet(crc, next, -91);
+ FALLTHROUGH_INTENDED;
+ case 90:
+ CRCtriplet(crc, next, -90);
+ FALLTHROUGH_INTENDED;
+ case 89:
+ CRCtriplet(crc, next, -89);
+ FALLTHROUGH_INTENDED;
+ case 88:
+ CRCtriplet(crc, next, -88);
+ FALLTHROUGH_INTENDED;
+ case 87:
+ CRCtriplet(crc, next, -87);
+ FALLTHROUGH_INTENDED;
+ case 86:
+ CRCtriplet(crc, next, -86);
+ FALLTHROUGH_INTENDED;
+ case 85:
+ CRCtriplet(crc, next, -85);
+ FALLTHROUGH_INTENDED;
+ case 84:
+ CRCtriplet(crc, next, -84);
+ FALLTHROUGH_INTENDED;
+ case 83:
+ CRCtriplet(crc, next, -83);
+ FALLTHROUGH_INTENDED;
+ case 82:
+ CRCtriplet(crc, next, -82);
+ FALLTHROUGH_INTENDED;
+ case 81:
+ CRCtriplet(crc, next, -81);
+ FALLTHROUGH_INTENDED;
+ case 80:
+ CRCtriplet(crc, next, -80);
+ FALLTHROUGH_INTENDED;
+ case 79:
+ CRCtriplet(crc, next, -79);
+ FALLTHROUGH_INTENDED;
+ case 78:
+ CRCtriplet(crc, next, -78);
+ FALLTHROUGH_INTENDED;
+ case 77:
+ CRCtriplet(crc, next, -77);
+ FALLTHROUGH_INTENDED;
+ case 76:
+ CRCtriplet(crc, next, -76);
+ FALLTHROUGH_INTENDED;
+ case 75:
+ CRCtriplet(crc, next, -75);
+ FALLTHROUGH_INTENDED;
+ case 74:
+ CRCtriplet(crc, next, -74);
+ FALLTHROUGH_INTENDED;
+ case 73:
+ CRCtriplet(crc, next, -73);
+ FALLTHROUGH_INTENDED;
+ case 72:
+ CRCtriplet(crc, next, -72);
+ FALLTHROUGH_INTENDED;
+ case 71:
+ CRCtriplet(crc, next, -71);
+ FALLTHROUGH_INTENDED;
+ case 70:
+ CRCtriplet(crc, next, -70);
+ FALLTHROUGH_INTENDED;
+ case 69:
+ CRCtriplet(crc, next, -69);
+ FALLTHROUGH_INTENDED;
+ case 68:
+ CRCtriplet(crc, next, -68);
+ FALLTHROUGH_INTENDED;
+ case 67:
+ CRCtriplet(crc, next, -67);
+ FALLTHROUGH_INTENDED;
+ case 66:
+ CRCtriplet(crc, next, -66);
+ FALLTHROUGH_INTENDED;
+ case 65:
+ CRCtriplet(crc, next, -65);
+ FALLTHROUGH_INTENDED;
+ case 64:
+ CRCtriplet(crc, next, -64);
+ FALLTHROUGH_INTENDED;
+ case 63:
+ CRCtriplet(crc, next, -63);
+ FALLTHROUGH_INTENDED;
+ case 62:
+ CRCtriplet(crc, next, -62);
+ FALLTHROUGH_INTENDED;
+ case 61:
+ CRCtriplet(crc, next, -61);
+ FALLTHROUGH_INTENDED;
+ case 60:
+ CRCtriplet(crc, next, -60);
+ FALLTHROUGH_INTENDED;
+ case 59:
+ CRCtriplet(crc, next, -59);
+ FALLTHROUGH_INTENDED;
+ case 58:
+ CRCtriplet(crc, next, -58);
+ FALLTHROUGH_INTENDED;
+ case 57:
+ CRCtriplet(crc, next, -57);
+ FALLTHROUGH_INTENDED;
+ case 56:
+ CRCtriplet(crc, next, -56);
+ FALLTHROUGH_INTENDED;
+ case 55:
+ CRCtriplet(crc, next, -55);
+ FALLTHROUGH_INTENDED;
+ case 54:
+ CRCtriplet(crc, next, -54);
+ FALLTHROUGH_INTENDED;
+ case 53:
+ CRCtriplet(crc, next, -53);
+ FALLTHROUGH_INTENDED;
+ case 52:
+ CRCtriplet(crc, next, -52);
+ FALLTHROUGH_INTENDED;
+ case 51:
+ CRCtriplet(crc, next, -51);
+ FALLTHROUGH_INTENDED;
+ case 50:
+ CRCtriplet(crc, next, -50);
+ FALLTHROUGH_INTENDED;
+ case 49:
+ CRCtriplet(crc, next, -49);
+ FALLTHROUGH_INTENDED;
+ case 48:
+ CRCtriplet(crc, next, -48);
+ FALLTHROUGH_INTENDED;
+ case 47:
+ CRCtriplet(crc, next, -47);
+ FALLTHROUGH_INTENDED;
+ case 46:
+ CRCtriplet(crc, next, -46);
+ FALLTHROUGH_INTENDED;
+ case 45:
+ CRCtriplet(crc, next, -45);
+ FALLTHROUGH_INTENDED;
+ case 44:
+ CRCtriplet(crc, next, -44);
+ FALLTHROUGH_INTENDED;
+ case 43:
+ CRCtriplet(crc, next, -43);
+ FALLTHROUGH_INTENDED;
+ case 42:
+ CRCtriplet(crc, next, -42);
+ FALLTHROUGH_INTENDED;
+ case 41:
+ CRCtriplet(crc, next, -41);
+ FALLTHROUGH_INTENDED;
+ case 40:
+ CRCtriplet(crc, next, -40);
+ FALLTHROUGH_INTENDED;
+ case 39:
+ CRCtriplet(crc, next, -39);
+ FALLTHROUGH_INTENDED;
+ case 38:
+ CRCtriplet(crc, next, -38);
+ FALLTHROUGH_INTENDED;
+ case 37:
+ CRCtriplet(crc, next, -37);
+ FALLTHROUGH_INTENDED;
+ case 36:
+ CRCtriplet(crc, next, -36);
+ FALLTHROUGH_INTENDED;
+ case 35:
+ CRCtriplet(crc, next, -35);
+ FALLTHROUGH_INTENDED;
+ case 34:
+ CRCtriplet(crc, next, -34);
+ FALLTHROUGH_INTENDED;
+ case 33:
+ CRCtriplet(crc, next, -33);
+ FALLTHROUGH_INTENDED;
+ case 32:
+ CRCtriplet(crc, next, -32);
+ FALLTHROUGH_INTENDED;
+ case 31:
+ CRCtriplet(crc, next, -31);
+ FALLTHROUGH_INTENDED;
+ case 30:
+ CRCtriplet(crc, next, -30);
+ FALLTHROUGH_INTENDED;
+ case 29:
+ CRCtriplet(crc, next, -29);
+ FALLTHROUGH_INTENDED;
+ case 28:
+ CRCtriplet(crc, next, -28);
+ FALLTHROUGH_INTENDED;
+ case 27:
+ CRCtriplet(crc, next, -27);
+ FALLTHROUGH_INTENDED;
+ case 26:
+ CRCtriplet(crc, next, -26);
+ FALLTHROUGH_INTENDED;
+ case 25:
+ CRCtriplet(crc, next, -25);
+ FALLTHROUGH_INTENDED;
+ case 24:
+ CRCtriplet(crc, next, -24);
+ FALLTHROUGH_INTENDED;
+ case 23:
+ CRCtriplet(crc, next, -23);
+ FALLTHROUGH_INTENDED;
+ case 22:
+ CRCtriplet(crc, next, -22);
+ FALLTHROUGH_INTENDED;
+ case 21:
+ CRCtriplet(crc, next, -21);
+ FALLTHROUGH_INTENDED;
+ case 20:
+ CRCtriplet(crc, next, -20);
+ FALLTHROUGH_INTENDED;
+ case 19:
+ CRCtriplet(crc, next, -19);
+ FALLTHROUGH_INTENDED;
+ case 18:
+ CRCtriplet(crc, next, -18);
+ FALLTHROUGH_INTENDED;
+ case 17:
+ CRCtriplet(crc, next, -17);
+ FALLTHROUGH_INTENDED;
+ case 16:
+ CRCtriplet(crc, next, -16);
+ FALLTHROUGH_INTENDED;
+ case 15:
+ CRCtriplet(crc, next, -15);
+ FALLTHROUGH_INTENDED;
+ case 14:
+ CRCtriplet(crc, next, -14);
+ FALLTHROUGH_INTENDED;
+ case 13:
+ CRCtriplet(crc, next, -13);
+ FALLTHROUGH_INTENDED;
+ case 12:
+ CRCtriplet(crc, next, -12);
+ FALLTHROUGH_INTENDED;
+ case 11:
+ CRCtriplet(crc, next, -11);
+ FALLTHROUGH_INTENDED;
+ case 10:
+ CRCtriplet(crc, next, -10);
+ FALLTHROUGH_INTENDED;
+ case 9:
+ CRCtriplet(crc, next, -9);
+ FALLTHROUGH_INTENDED;
+ case 8:
+ CRCtriplet(crc, next, -8);
+ FALLTHROUGH_INTENDED;
+ case 7:
+ CRCtriplet(crc, next, -7);
+ FALLTHROUGH_INTENDED;
+ case 6:
+ CRCtriplet(crc, next, -6);
+ FALLTHROUGH_INTENDED;
+ case 5:
+ CRCtriplet(crc, next, -5);
+ FALLTHROUGH_INTENDED;
+ case 4:
+ CRCtriplet(crc, next, -4);
+ FALLTHROUGH_INTENDED;
+ case 3:
+ CRCtriplet(crc, next, -3);
+ FALLTHROUGH_INTENDED;
+ case 2:
+ CRCtriplet(crc, next, -2);
+ FALLTHROUGH_INTENDED;
+ case 1:
+ CRCduplet(crc, next, -1); // the final triplet is actually only 2
+ //{ CombineCRC(); }
+ crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
+ if (--n > 0) {
+ crc1 = crc2 = 0;
+ block_size = 128;
+ // points to the first byte of the next block
+ next0 = next2 + 128;
+ next1 = next0 + 128; // from here on all blocks are 128 long
+ next2 = next1 + 128;
+ }
+ FALLTHROUGH_INTENDED;
+ case 0:;
+ } while (n > 0);
+ }
+ next = (const unsigned char*)next2;
+ }
+ uint64_t count2 = len >> 3; // 216 or fewer bytes is 27 or fewer singlets
+ len = len & 7;
+ next += (count2 * 8);
+ switch (count2) {
+ case 27:
+ CRCsinglet(crc0, next, -27 * 8);
+ FALLTHROUGH_INTENDED;
+ case 26:
+ CRCsinglet(crc0, next, -26 * 8);
+ FALLTHROUGH_INTENDED;
+ case 25:
+ CRCsinglet(crc0, next, -25 * 8);
+ FALLTHROUGH_INTENDED;
+ case 24:
+ CRCsinglet(crc0, next, -24 * 8);
+ FALLTHROUGH_INTENDED;
+ case 23:
+ CRCsinglet(crc0, next, -23 * 8);
+ FALLTHROUGH_INTENDED;
+ case 22:
+ CRCsinglet(crc0, next, -22 * 8);
+ FALLTHROUGH_INTENDED;
+ case 21:
+ CRCsinglet(crc0, next, -21 * 8);
+ FALLTHROUGH_INTENDED;
+ case 20:
+ CRCsinglet(crc0, next, -20 * 8);
+ FALLTHROUGH_INTENDED;
+ case 19:
+ CRCsinglet(crc0, next, -19 * 8);
+ FALLTHROUGH_INTENDED;
+ case 18:
+ CRCsinglet(crc0, next, -18 * 8);
+ FALLTHROUGH_INTENDED;
+ case 17:
+ CRCsinglet(crc0, next, -17 * 8);
+ FALLTHROUGH_INTENDED;
+ case 16:
+ CRCsinglet(crc0, next, -16 * 8);
+ FALLTHROUGH_INTENDED;
+ case 15:
+ CRCsinglet(crc0, next, -15 * 8);
+ FALLTHROUGH_INTENDED;
+ case 14:
+ CRCsinglet(crc0, next, -14 * 8);
+ FALLTHROUGH_INTENDED;
+ case 13:
+ CRCsinglet(crc0, next, -13 * 8);
+ FALLTHROUGH_INTENDED;
+ case 12:
+ CRCsinglet(crc0, next, -12 * 8);
+ FALLTHROUGH_INTENDED;
+ case 11:
+ CRCsinglet(crc0, next, -11 * 8);
+ FALLTHROUGH_INTENDED;
+ case 10:
+ CRCsinglet(crc0, next, -10 * 8);
+ FALLTHROUGH_INTENDED;
+ case 9:
+ CRCsinglet(crc0, next, -9 * 8);
+ FALLTHROUGH_INTENDED;
+ case 8:
+ CRCsinglet(crc0, next, -8 * 8);
+ FALLTHROUGH_INTENDED;
+ case 7:
+ CRCsinglet(crc0, next, -7 * 8);
+ FALLTHROUGH_INTENDED;
+ case 6:
+ CRCsinglet(crc0, next, -6 * 8);
+ FALLTHROUGH_INTENDED;
+ case 5:
+ CRCsinglet(crc0, next, -5 * 8);
+ FALLTHROUGH_INTENDED;
+ case 4:
+ CRCsinglet(crc0, next, -4 * 8);
+ FALLTHROUGH_INTENDED;
+ case 3:
+ CRCsinglet(crc0, next, -3 * 8);
+ FALLTHROUGH_INTENDED;
+ case 2:
+ CRCsinglet(crc0, next, -2 * 8);
+ FALLTHROUGH_INTENDED;
+ case 1:
+ CRCsinglet(crc0, next, -1 * 8);
+ FALLTHROUGH_INTENDED;
+ case 0:;
+ }
+ }
+ {
+ align_to_8(len, crc0, next);
+ return (uint32_t)crc0 ^ 0xffffffffu;
+ }
+}
+
+#endif //HAVE_SSE42 && HAVE_PCLMUL
+// Picks the Extend() backend from compile-time macros plus runtime CPU probes; initializes ChosenExtend.
+static inline Function Choose_Extend() {
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+  return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>;
+#elif defined(HAVE_POWER8)
+  // isAltiVec() and ExtendPPCImpl are only compiled when HAS_ALTIVEC is also
+  // defined; without it fall back to Slow_CRC32.
+  return ExtendImpl<Slow_CRC32>;
+#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
+  if (!crc32c_runtime_check()) {
+    return ExtendImpl<Slow_CRC32>;
+  }
+  pmull_runtime_flag = crc32c_pmull_runtime_check();
+  return ExtendARMImpl;
+#else
+  if (!isSSE42()) {
+    return ExtendImpl<Slow_CRC32>;
+  }
+  // isPCLMULQDQ() is always evaluated, as before, so it stays referenced even
+  // when the 3-way implementation is compiled out.
+  if (isPCLMULQDQ()) {
+#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C
+    return crc32c_3way;
+#endif
+  }
+  // SSE4.2 without a usable 3-way path.
+  return ExtendImpl<Fast_CRC32>;  // Fast_CRC32 will check HAVE_SSE42 itself
+#endif
+}
+
+static Function ChosenExtend = Choose_Extend();
+uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
+ return ChosenExtend(crc, buf, size);
+}
+
+
+} // namespace crc32c
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/crc32c.h b/src/rocksdb/util/crc32c.h
new file mode 100644
index 000000000..a3c7cd1d7
--- /dev/null
+++ b/src/rocksdb/util/crc32c.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+extern std::string IsFastCrc32Supported();
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A. Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Return the crc32c of data[0,n-1]
+inline uint32_t Value(const char* data, size_t n) {
+ return Extend(0, data, n);
+}
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
+
+// Return a masked representation of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs. Therefore we recommend that CRCs stored
+// somewhere (e.g., in files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+ // Rotate right by 15 bits and add a constant.
+ return ((crc >> 15) | (crc << 17)) + kMaskDelta;
+}
+
+// Return the crc whose masked representation is masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+ uint32_t rot = masked_crc - kMaskDelta;
+ return ((rot >> 17) | (rot << 15));
+}
+
+} // namespace crc32c
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/crc32c_arm64.cc b/src/rocksdb/util/crc32c_arm64.cc
new file mode 100644
index 000000000..566810f4b
--- /dev/null
+++ b/src/rocksdb/util/crc32c_arm64.cc
@@ -0,0 +1,165 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/crc32c_arm64.h"
+
+#if defined(__linux__) && defined(HAVE_ARM64_CRC)
+
+#include <asm/hwcap.h>
+#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+#include <sys/auxv.h>
+#endif
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+#ifndef HWCAP_PMULL
+#define HWCAP_PMULL (1 << 4)
+#endif
+
+#ifdef HAVE_ARM64_CRYPTO
+/* unfolding to compute 8 * 3 = 24 bytes parallelly */
+#define CRC32C24BYTES(ITR) \
+ crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR))); \
+ crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH * 2 + (ITR))); \
+ crc0 = crc32c_u64(crc0, *(buf64 + (ITR)));
+
+/* unfolding to compute 24 * 7 = 168 bytes parallelly */
+#define CRC32C7X24BYTES(ITR) \
+ do { \
+ CRC32C24BYTES((ITR)*7 + 0) \
+ CRC32C24BYTES((ITR)*7 + 1) \
+ CRC32C24BYTES((ITR)*7 + 2) \
+ CRC32C24BYTES((ITR)*7 + 3) \
+ CRC32C24BYTES((ITR)*7 + 4) \
+ CRC32C24BYTES((ITR)*7 + 5) \
+ CRC32C24BYTES((ITR)*7 + 6) \
+ } while (0)
+#endif
+
+extern bool pmull_runtime_flag;
+
+uint32_t crc32c_runtime_check(void) {
+#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+ uint64_t auxv = getauxval(AT_HWCAP);
+ return (auxv & HWCAP_CRC32) != 0;
+#else
+ return 0;
+#endif
+}
+
+bool crc32c_pmull_runtime_check(void) {
+#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+ uint64_t auxv = getauxval(AT_HWCAP);
+ return (auxv & HWCAP_PMULL) != 0;
+#else
+ return false;
+#endif
+}
+
+// Extends the CRC32C value `crc` over data[0, len). The value is XORed with
+// 0xffffffff on entry and again on exit. When pmull_runtime_flag is set and
+// HAVE_ARM64_CRYPTO is compiled in, 1024-byte chunks use the 3-way parallel path.
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len) {
+  const uint64_t *buf64 = (uint64_t *)data;
+  // Stay unsigned: (int)len went negative past INT_MAX and skipped all loops.
+  unsigned length = len;
+  crc ^= 0xffffffff;
+
+  /*
+   * Pmull runtime check here.
+   * Raspberry Pi supports crc32 but doesn't support pmull.
+   * Skip Crc32c Parallel computation if no crypto extension available.
+   */
+  if (pmull_runtime_flag) {
+/* Macro (HAVE_ARM64_CRYPTO) is used for compiling check */
+#ifdef HAVE_ARM64_CRYPTO
+/* Crc32c Parallel computation
+ *   Algorithm comes from Intel whitepaper:
+ *   crc-iscsi-polynomial-crc32-instruction-paper
+ *
+ * Input data is divided into three equal-sized blocks
+ *   Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
+ *   One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
+ */
+#define BLK_LENGTH 42
+    while (length >= 1024) {
+      uint64_t t0, t1;
+      uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
+
+      /* Parallel Param:
+       *   k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+       *   k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+       */
+      uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+      /* Prefetch data for following block to avoid cache miss */
+      PREF1KL1((uint8_t *)buf64, 1024);
+
+      /* First 8 bytes for better pipelining */
+      crc0 = crc32c_u64(crc, *buf64++);
+
+      /* 3 blocks crc32c parallel computation
+       * Macro unfolding to compute parallelly
+       * 168 * 6 = 1008 (bytes)
+       */
+      CRC32C7X24BYTES(0);
+      CRC32C7X24BYTES(1);
+      CRC32C7X24BYTES(2);
+      CRC32C7X24BYTES(3);
+      CRC32C7X24BYTES(4);
+      CRC32C7X24BYTES(5);
+      buf64 += (BLK_LENGTH * 3);
+
+      /* Last 8 bytes */
+      crc = crc32c_u64(crc2, *buf64++);
+
+      t0 = (uint64_t)vmull_p64(crc0, k0);
+      t1 = (uint64_t)vmull_p64(crc1, k1);
+
+      /* Merge (crc0, crc1, crc2) -> crc */
+      crc1 = crc32c_u64(0, t1);
+      crc ^= crc1;
+      crc0 = crc32c_u64(0, t0);
+      crc ^= crc0;
+
+      length -= 1024;
+    }
+
+    if (length == 0) return crc ^ (0xffffffffU);
+#endif
+  }  // if Pmull runtime check here
+
+  const uint8_t *buf8 = (const uint8_t *)buf64;
+  while (length >= 8) {
+    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
+    buf8 += 8;
+    length -= 8;
+  }
+
+  /* The following is more efficient than the straight loop */
+  if (length >= 4) {
+    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
+    buf8 += 4;
+    length -= 4;
+  }
+
+  if (length >= 2) {
+    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
+    buf8 += 2;
+    length -= 2;
+  }
+
+  if (length >= 1) crc = crc32c_u8(crc, *buf8);
+  return crc ^ 0xffffffff;
+}
+
+#endif
diff --git a/src/rocksdb/util/crc32c_arm64.h b/src/rocksdb/util/crc32c_arm64.h
new file mode 100644
index 000000000..a12354683
--- /dev/null
+++ b/src/rocksdb/util/crc32c_arm64.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef UTIL_CRC32C_ARM64_H
+#define UTIL_CRC32C_ARM64_H
+
+#include <cinttypes>
+
+#if defined(__aarch64__) || defined(__AARCH64__)
+
+#ifdef __ARM_FEATURE_CRC32
+#define HAVE_ARM64_CRC
+#include <arm_acle.h>
+#define crc32c_u8(crc, v) __crc32cb(crc, v)
+#define crc32c_u16(crc, v) __crc32ch(crc, v)
+#define crc32c_u32(crc, v) __crc32cw(crc, v)
+#define crc32c_u64(crc, v) __crc32cd(crc, v)
+#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+ [c] "I"((PREF_OFFSET) + ((ITR) + 0) * 64)); \
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+ [c] "I"((PREF_OFFSET) + ((ITR) + 1) * 64)); \
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+ [c] "I"((PREF_OFFSET) + ((ITR) + 2) * 64)); \
+ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+ [c] "I"((PREF_OFFSET) + ((ITR) + 3) * 64));
+
+#define PREF1KL1(buffer, PREF_OFFSET) \
+ PREF4X64L1(buffer, (PREF_OFFSET), 0) \
+ PREF4X64L1(buffer, (PREF_OFFSET), 4) \
+ PREF4X64L1(buffer, (PREF_OFFSET), 8) \
+ PREF4X64L1(buffer, (PREF_OFFSET), 12)
+
+extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len);
+extern uint32_t crc32c_runtime_check(void);
+extern bool crc32c_pmull_runtime_check(void);
+
+#ifdef __ARM_FEATURE_CRYPTO
+#define HAVE_ARM64_CRYPTO
+#include <arm_neon.h>
+#endif // __ARM_FEATURE_CRYPTO
+#endif // __ARM_FEATURE_CRC32
+
+#endif // defined(__aarch64__) || defined(__AARCH64__)
+
+#endif
diff --git a/src/rocksdb/util/crc32c_ppc.c b/src/rocksdb/util/crc32c_ppc.c
new file mode 100644
index 000000000..888a4943e
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc.c
@@ -0,0 +1,94 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#define CRC_TABLE
+#include <stdint.h>
+#include <stdlib.h>
+#include <strings.h>
+#include "util/crc32c_ppc_constants.h"
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN - 1)
+
+#ifdef REFLECT
+static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
+ unsigned long len) {
+ while (len--) crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+ return crc;
+}
+#endif
+
+#ifdef HAVE_POWER8
+unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
+ unsigned long len);
+
+static uint32_t crc32_vpmsum(uint32_t crc, unsigned char const *data,
+ unsigned len) {
+ unsigned int prealign;
+ unsigned int tail;
+
+#ifdef CRC_XOR
+ crc ^= 0xffffffff;
+#endif
+
+ if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
+ crc = crc32_align(crc, data, (unsigned long)len);
+ goto out;
+ }
+
+ if ((unsigned long)data & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)data & VMX_ALIGN_MASK);
+ crc = crc32_align(crc, data, prealign);
+ len -= prealign;
+ data += prealign;
+ }
+
+ crc = __crc32_vpmsum(crc, data, (unsigned long)len & ~VMX_ALIGN_MASK);
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ data += len & ~VMX_ALIGN_MASK;
+ crc = crc32_align(crc, data, tail);
+ }
+
+out:
+#ifdef CRC_XOR
+ crc ^= 0xffffffff;
+#endif
+
+ return crc;
+}
+
+/* This wrapper function works around the fact that crc32_vpmsum
+ * does not gracefully handle the case where the data pointer is NULL. A NULL
+ * buffer is checksummed as len zero bytes, fed from a static block so that no
+ * allocation (and no unchecked malloc result) is involved.
+ */
+uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len) {
+  static const unsigned char zeros[1024] = {0};
+
+  if (data) return crc32_vpmsum(crc, data, len);
+  /* Feed the zeros in chunks, threading crc through each call. */
+  while (len > 0) {
+    unsigned chunk = len < sizeof(zeros) ? len : (unsigned)sizeof(zeros);
+    crc = crc32_vpmsum(crc, zeros, chunk);
+    len -= chunk;
+  }
+  return crc;
+}
+
+#else /* HAVE_POWER8 */
+
+/* This symbol has to exist on non-ppc architectures (and on legacy
+ * ppc systems using power7 or below) in order to compile properly
+ * there, even though it won't be called.
+ */
+uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len) {
+ return 0;
+}
+
+#endif /* HAVE_POWER8 */
diff --git a/src/rocksdb/util/crc32c_ppc.h b/src/rocksdb/util/crc32c_ppc.h
new file mode 100644
index 000000000..c359061c6
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc.h
@@ -0,0 +1,19 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h> /* uint32_t; keeps this header usable on its own */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * CRC32C entry point defined in crc32c_ppc.c.
+ *
+ * crc    - running checksum to continue from
+ * buffer - bytes to checksum; in the POWER8 build a NULL pointer is
+ *          checksummed as len zero bytes
+ * len    - number of bytes
+ *
+ * Returns the updated checksum. Builds without HAVE_POWER8 only provide a
+ * stub that returns 0.
+ */
+extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
+                           unsigned len);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/rocksdb/util/crc32c_ppc_asm.S b/src/rocksdb/util/crc32c_ppc_asm.S
new file mode 100644
index 000000000..a317bf96b
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc_asm.S
@@ -0,0 +1,752 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <ppc-asm.h>
+#include "ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+#define __ASSEMBLY__
+#include "crc32c_ppc_constants.h"
+
+ .text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16 r25
+#define off32 r26
+#define off48 r27
+#define off64 r28
+#define off80 r29
+#define off96 r30
+#define off112 r31
+
+#define const1 v24
+#define const2 v25
+
+#define byteswap v26
+#define mask_32bit v27
+#define mask_64bit v28
+#define zeroes v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/*
+ * unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len)
+ *
+ * Vector CRC kernel built on the VPMSUMD/VPMSUMW macros. The code treats
+ * r3 as crc, r4 as p and r5 as len, and leaves the result in r3.
+ *
+ * - len == 0 returns the incoming crc untouched (.Lzero).
+ * - len < 256 takes the .Lshort path: up to sixteen 16-byte chunks, each
+ *   multiplied by an entry of .short_constants and xored together.
+ * - Otherwise the data is consumed in blocks of at most MAX_SIZE bytes,
+ *   128 bytes (eight 16-byte vectors, v16-v23) per loop iteration with
+ *   eight parallel accumulators v0-v7, followed by a 0-112 byte tail.
+ * - Both paths finish in .Lbarrett_reduction.
+ *
+ * r25-r31 and v20-v29 are stored at negative offsets from r1 on entry and
+ * reloaded at .Lout; r1 itself is never changed.
+ *
+ * NOTE(review): both paths count the remainder in whole 16-byte chunks
+ * (srdi ...,4), so len looks like it is expected to be a multiple of 16.
+ * The C caller masks it with ~VMX_ALIGN_MASK - confirm VMX_ALIGN is 16.
+ */
+FUNC_START(__crc32_vpmsum)
+ std r31,-8(r1) /* save r25-r31 (used as off16..off112) below r1 */
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+ std r26,-48(r1)
+ std r25,-56(r1)
+
+ li off16,16
+ li off32,32
+ li off48,48
+ li off64,64
+ li off80,80
+ li off96,96
+ li off112,112
+ li r0,0 /* r0 == 0: v16-v23 not preloaded yet (see cmpdi r0,1 below) */
+
+ /* Enough room for saving 10 non volatile VMX registers */
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ stvx v20,0,r6
+ stvx v21,off16,r6
+ stvx v22,off32,r6
+ stvx v23,off48,r6
+ stvx v24,off64,r6
+ stvx v25,off80,r6
+ stvx v26,off96,r6
+ stvx v27,off112,r6
+ stvx v28,0,r7
+ stvx v29,off16,r7
+
+ mr r10,r3 /* keep the incoming crc; .Lzero returns it unchanged */
+
+ vxor zeroes,zeroes,zeroes
+ vspltisw v0,-1 /* v0 = all ones, source for the two masks below */
+
+ vsldoi mask_32bit,zeroes,v0,4
+ vsldoi mask_64bit,zeroes,v0,8
+
+ /* Get the initial value into v8 */
+ vxor v8,v8,v8
+ MTVRD(v8, r3)
+#ifdef REFLECT
+ vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
+#else
+ vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+ addis r3,r2,.byteswap_constant@toc@ha
+ addi r3,r3,.byteswap_constant@toc@l
+
+ lvx byteswap,0,r3
+ addi r3,r3,16
+#endif
+
+ cmpdi r5,256 /* fewer than 256 bytes: take the short path */
+ blt .Lshort
+
+ rldicr r6,r5,0,56 /* r6 = len with the low 7 bits cleared */
+
+ /* Checksum in blocks of MAX_SIZE */
+1: lis r7,MAX_SIZE@h
+ ori r7,r7,MAX_SIZE@l
+ mr r9,r7
+ cmpd r6,r7
+ bgt 2f
+ mr r7,r6
+2: subf r6,r7,r6
+
+ /* our main loop does 128 bytes at a time */
+ srdi r7,r7,7
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ sldi r8,r7,4
+ srdi r9,r9,3
+ subf r8,r8,r9
+
+ /* We reduce our final 128 bytes in a separate step */
+ addi r7,r7,-1
+ mtctr r7
+
+ addis r3,r2,.constants@toc@ha
+ addi r3,r3,.constants@toc@l
+
+ /* Find the start of our constants */
+ add r3,r3,r8
+
+ /* zero v0-v7 which will contain our checksums */
+ vxor v0,v0,v0
+ vxor v1,v1,v1
+ vxor v2,v2,v2
+ vxor v3,v3,v3
+ vxor v4,v4,v4
+ vxor v5,v5,v5
+ vxor v6,v6,v6
+ vxor v7,v7,v7
+
+ lvx const1,0,r3
+
+ /*
+ * If we are looping back to consume more data we use the values
+ * already in v16-v23.
+ */
+ cmpdi r0,1
+ beq 2f
+
+ /* First warm up pass */
+ lvx v16,0,r4
+ lvx v17,off16,r4
+ VPERM(v16,v16,v16,byteswap)
+ VPERM(v17,v17,v17,byteswap)
+ lvx v18,off32,r4
+ lvx v19,off48,r4
+ VPERM(v18,v18,v18,byteswap)
+ VPERM(v19,v19,v19,byteswap)
+ lvx v20,off64,r4
+ lvx v21,off80,r4
+ VPERM(v20,v20,v20,byteswap)
+ VPERM(v21,v21,v21,byteswap)
+ lvx v22,off96,r4
+ lvx v23,off112,r4
+ VPERM(v22,v22,v22,byteswap)
+ VPERM(v23,v23,v23,byteswap)
+ addi r4,r4,8*16
+
+ /* xor in initial value */
+ vxor v16,v16,v8
+
+2: bdz .Lfirst_warm_up_done
+
+ addi r3,r3,16
+ lvx const2,0,r3
+
+ /* Second warm up pass */
+ VPMSUMD(v8,v16,const1)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0 /* leaves r2 unchanged; presumably a scheduling hint - confirm */
+
+ VPMSUMD(v9,v17,const1)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v10,v18,const1)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v11,v19,const1)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdz .Lfirst_cool_down
+
+ /*
+ * main loop. We modulo schedule it such that it takes three iterations
+ * to complete - first iteration load, second iteration vpmsum, third
+ * iteration xor.
+ */
+ .balign 16
+4: lvx const1,0,r3
+ addi r3,r3,16
+ ori r2,r2,0
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const2)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const2)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const2)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const2)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ lvx const2,0,r3
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdnz 4b
+
+.Lfirst_cool_down:
+ /* First cool down pass */
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const1)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const1)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const1)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const1)
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ ori r2,r2,0
+
+.Lsecond_cool_down:
+ /* Second cool down pass */
+ vxor v0,v0,v8
+ vxor v1,v1,v9
+ vxor v2,v2,v10
+ vxor v3,v3,v11
+ vxor v4,v4,v12
+ vxor v5,v5,v13
+ vxor v6,v6,v14
+ vxor v7,v7,v15
+
+#ifdef REFLECT
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ vsldoi v0,v0,zeroes,4
+ vsldoi v1,v1,zeroes,4
+ vsldoi v2,v2,zeroes,4
+ vsldoi v3,v3,zeroes,4
+ vsldoi v4,v4,zeroes,4
+ vsldoi v5,v5,zeroes,4
+ vsldoi v6,v6,zeroes,4
+ vsldoi v7,v7,zeroes,4
+#endif
+
+ /* xor with last 1024 bits */
+ lvx v8,0,r4
+ lvx v9,off16,r4
+ VPERM(v8,v8,v8,byteswap)
+ VPERM(v9,v9,v9,byteswap)
+ lvx v10,off32,r4
+ lvx v11,off48,r4
+ VPERM(v10,v10,v10,byteswap)
+ VPERM(v11,v11,v11,byteswap)
+ lvx v12,off64,r4
+ lvx v13,off80,r4
+ VPERM(v12,v12,v12,byteswap)
+ VPERM(v13,v13,v13,byteswap)
+ lvx v14,off96,r4
+ lvx v15,off112,r4
+ VPERM(v14,v14,v14,byteswap)
+ VPERM(v15,v15,v15,byteswap)
+
+ addi r4,r4,8*16
+
+ vxor v16,v0,v8
+ vxor v17,v1,v9
+ vxor v18,v2,v10
+ vxor v19,v3,v11
+ vxor v20,v4,v12
+ vxor v21,v5,v13
+ vxor v22,v6,v14
+ vxor v23,v7,v15
+
+ li r0,1 /* on loop-back, skip the warm-up loads and reuse v16-v23 */
+ cmpdi r6,0
+ addi r6,r6,128
+ bne 1b
+
+ /* Work out how many bytes we have left */
+ andi. r5,r5,127
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,128
+ add r3,r3,r6
+
+ /* How many 16 byte chunks are in the tail */
+ srdi r7,r5,4
+ mtctr r7
+
+ /*
+ * Reduce the previously calculated 1024 bits to 64 bits, shifting
+ * 32 bits to include the trailing 32 bits of zeros
+ */
+ lvx v0,0,r3
+ lvx v1,off16,r3
+ lvx v2,off32,r3
+ lvx v3,off48,r3
+ lvx v4,off64,r3
+ lvx v5,off80,r3
+ lvx v6,off96,r3
+ lvx v7,off112,r3
+ addi r3,r3,8*16
+
+ VPMSUMW(v0,v16,v0)
+ VPMSUMW(v1,v17,v1)
+ VPMSUMW(v2,v18,v2)
+ VPMSUMW(v3,v19,v3)
+ VPMSUMW(v4,v20,v4)
+ VPMSUMW(v5,v21,v5)
+ VPMSUMW(v6,v22,v6)
+ VPMSUMW(v7,v23,v7)
+
+ /* Now reduce the tail (0 - 112 bytes) */
+ cmpdi r7,0
+ beq 1f
+
+ lvx v16,0,r4
+ lvx v17,0,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off16,r4
+ lvx v17,off16,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off32,r4
+ lvx v17,off32,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off48,r4
+ lvx v17,off48,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off64,r4
+ lvx v17,off64,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off80,r4
+ lvx v17,off80,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off96,r4
+ lvx v17,off96,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+
+ /* Now xor all the parallel chunks together */
+1: vxor v0,v0,v1
+ vxor v2,v2,v3
+ vxor v4,v4,v5
+ vxor v6,v6,v7
+
+ vxor v0,v0,v2
+ vxor v4,v4,v6
+
+ vxor v0,v0,v4
+
+.Lbarrett_reduction:
+ /* Barrett constants */
+ addis r3,r2,.barrett_constants@toc@ha
+ addi r3,r3,.barrett_constants@toc@l
+
+ lvx const1,0,r3
+ lvx const2,off16,r3
+
+ vsldoi v1,v0,v0,8
+ vxor v0,v0,v1 /* xor two 64 bit results together */
+
+#ifdef REFLECT
+ /* shift left one bit */
+ vspltisb v1,1
+ vsl v0,v0,v1
+#endif
+
+ vand v0,v0,mask_64bit
+
+#ifndef REFLECT
+ /*
+ * Now for the Barrett reduction algorithm. The idea is to calculate q,
+ * the multiple of our polynomial that we need to subtract. By
+ * doing the computation 2x bits higher (ie 64 bits) and shifting the
+ * result back down 2x bits, we round down to the nearest multiple.
+ */
+ VPMSUMD(v1,v0,const1) /* ma */
+ vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Get the result into r3. We need to shift it left 8 bytes:
+ * V0 [ 0 1 2 X ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
+#else
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+ vand v1,v0,mask_32bit /* bottom 32 bits of a */
+ VPMSUMD(v1,v1,const1) /* ma */
+ vand v1,v1,mask_32bit /* bottom 32bits of ma */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of v0 */
+#endif
+
+ /* Get it into r3 */
+ MFVRD(r3, v0)
+
+.Lout: /* common exit: reload v20-v29 and r25-r31 from the save area */
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ lvx v20,0,r6
+ lvx v21,off16,r6
+ lvx v22,off32,r6
+ lvx v23,off48,r6
+ lvx v24,off64,r6
+ lvx v25,off80,r6
+ lvx v26,off96,r6
+ lvx v27,off112,r6
+ lvx v28,0,r7
+ lvx v29,off16,r7
+
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ ld r26,-48(r1)
+ ld r25,-56(r1)
+
+ blr
+
+.Lfirst_warm_up_done: /* target of the bdz right after the first warm up pass */
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ VPMSUMD(v8,v16,const1)
+ VPMSUMD(v9,v17,const1)
+ VPMSUMD(v10,v18,const1)
+ VPMSUMD(v11,v19,const1)
+ VPMSUMD(v12,v20,const1)
+ VPMSUMD(v13,v21,const1)
+ VPMSUMD(v14,v22,const1)
+ VPMSUMD(v15,v23,const1)
+
+ b .Lsecond_cool_down
+
+.Lshort: /* len < 256 */
+ cmpdi r5,0
+ beq .Lzero
+
+ addis r3,r2,.short_constants@toc@ha
+ addi r3,r3,.short_constants@toc@l
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,256
+ add r3,r3,r6
+
+ /* How many 16 byte chunks? */
+ srdi r7,r5,4
+ mtctr r7
+
+ vxor v19,v19,v19 /* v19 and v20 collect the per-chunk products below */
+ vxor v20,v20,v20
+
+ lvx v0,0,r4
+ lvx v16,0,r3
+ VPERM(v0,v0,v16,byteswap)
+ vxor v0,v0,v8 /* xor in initial value */
+ VPMSUMW(v0,v0,v16)
+ bdz .Lv0
+
+ lvx v1,off16,r4
+ lvx v17,off16,r3
+ VPERM(v1,v1,v17,byteswap)
+ VPMSUMW(v1,v1,v17)
+ bdz .Lv1
+
+ lvx v2,off32,r4
+ lvx v16,off32,r3
+ VPERM(v2,v2,v16,byteswap)
+ VPMSUMW(v2,v2,v16)
+ bdz .Lv2
+
+ lvx v3,off48,r4
+ lvx v17,off48,r3
+ VPERM(v3,v3,v17,byteswap)
+ VPMSUMW(v3,v3,v17)
+ bdz .Lv3
+
+ lvx v4,off64,r4
+ lvx v16,off64,r3
+ VPERM(v4,v4,v16,byteswap)
+ VPMSUMW(v4,v4,v16)
+ bdz .Lv4
+
+ lvx v5,off80,r4
+ lvx v17,off80,r3
+ VPERM(v5,v5,v17,byteswap)
+ VPMSUMW(v5,v5,v17)
+ bdz .Lv5
+
+ lvx v6,off96,r4
+ lvx v16,off96,r3
+ VPERM(v6,v6,v16,byteswap)
+ VPMSUMW(v6,v6,v16)
+ bdz .Lv6
+
+ lvx v7,off112,r4
+ lvx v17,off112,r3
+ VPERM(v7,v7,v17,byteswap)
+ VPMSUMW(v7,v7,v17)
+ bdz .Lv7
+
+ addi r3,r3,128
+ addi r4,r4,128
+
+ lvx v8,0,r4
+ lvx v16,0,r3
+ VPERM(v8,v8,v16,byteswap)
+ VPMSUMW(v8,v8,v16)
+ bdz .Lv8
+
+ lvx v9,off16,r4
+ lvx v17,off16,r3
+ VPERM(v9,v9,v17,byteswap)
+ VPMSUMW(v9,v9,v17)
+ bdz .Lv9
+
+ lvx v10,off32,r4
+ lvx v16,off32,r3
+ VPERM(v10,v10,v16,byteswap)
+ VPMSUMW(v10,v10,v16)
+ bdz .Lv10
+
+ lvx v11,off48,r4
+ lvx v17,off48,r3
+ VPERM(v11,v11,v17,byteswap)
+ VPMSUMW(v11,v11,v17)
+ bdz .Lv11
+
+ lvx v12,off64,r4
+ lvx v16,off64,r3
+ VPERM(v12,v12,v16,byteswap)
+ VPMSUMW(v12,v12,v16)
+ bdz .Lv12
+
+ lvx v13,off80,r4
+ lvx v17,off80,r3
+ VPERM(v13,v13,v17,byteswap)
+ VPMSUMW(v13,v13,v17)
+ bdz .Lv13
+
+ lvx v14,off96,r4
+ lvx v16,off96,r3
+ VPERM(v14,v14,v16,byteswap)
+ VPMSUMW(v14,v14,v16)
+ bdz .Lv14
+
+ lvx v15,off112,r4
+ lvx v17,off112,r3
+ VPERM(v15,v15,v17,byteswap)
+ VPMSUMW(v15,v15,v17)
+
+.Lv15: vxor v19,v19,v15 /* fall-through chain: entry at .LvN xors vN down to v0 */
+.Lv14: vxor v20,v20,v14
+.Lv13: vxor v19,v19,v13
+.Lv12: vxor v20,v20,v12
+.Lv11: vxor v19,v19,v11
+.Lv10: vxor v20,v20,v10
+.Lv9: vxor v19,v19,v9
+.Lv8: vxor v20,v20,v8
+.Lv7: vxor v19,v19,v7
+.Lv6: vxor v20,v20,v6
+.Lv5: vxor v19,v19,v5
+.Lv4: vxor v20,v20,v4
+.Lv3: vxor v19,v19,v3
+.Lv2: vxor v20,v20,v2
+.Lv1: vxor v19,v19,v1
+.Lv0: vxor v20,v20,v0
+
+ vxor v0,v19,v20
+
+ b .Lbarrett_reduction
+
+.Lzero: /* len == 0: hand back the crc saved in r10 */
+ mr r3,r10
+ b .Lout
+
+FUNC_END(__crc32_vpmsum)
diff --git a/src/rocksdb/util/crc32c_ppc_constants.h b/src/rocksdb/util/crc32c_ppc_constants.h
new file mode 100644
index 000000000..f6494cd01
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc_constants.h
@@ -0,0 +1,900 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (C) 2015, 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#define CRC 0x1edc6f41
+#define REFLECT
+#define CRC_XOR
+
+#ifndef __ASSEMBLY__
+#ifdef CRC_TABLE
+static const unsigned int crc_table[] = { /* 256-entry byte-at-a-time table.
+ * crc32_align() in crc32c_ppc.c indexes it with (crc ^ byte) & 0xff and
+ * xors the entry with crc >> 8. Only compiled when CRC_TABLE is defined
+ * and __ASSEMBLY__ is not.
+ */
+ 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c,
+ 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+ 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c,
+ 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+ 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc,
+ 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+ 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512,
+ 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+ 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad,
+ 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+ 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf,
+ 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+ 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f,
+ 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+ 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f,
+ 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+ 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e,
+ 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+ 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e,
+ 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+ 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de,
+ 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+ 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4,
+ 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+ 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b,
+ 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+ 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5,
+ 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+ 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975,
+ 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+ 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905,
+ 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+ 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8,
+ 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+ 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8,
+ 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+ 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78,
+ 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+ 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6,
+ 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+ 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69,
+ 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+ 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
+};
+
+#endif
+
+#else
+#define MAX_SIZE 32768
+.constants :
+
+ /* Reduce 262144 bits to 1024 bits */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ .octa 0x00000000b6ca9e20000000009c37c408
+
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ .octa 0x00000000350249a800000001b51df26c
+
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ .octa 0x00000001862dac54000000000724b9d0
+
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ .octa 0x00000001d87fb48c00000001c00532fe
+
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ .octa 0x00000001f39b699e00000000f05a9362
+
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ .octa 0x0000000101da11b400000001e1007970
+
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ .octa 0x00000001cab571e000000000a57366ee
+
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ .octa 0x00000000c7020cfe0000000192011284
+
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ .octa 0x00000000cdaed1ae0000000162716d9a
+
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ .octa 0x00000001e804effc00000000cd97ecde
+
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ .octa 0x0000000077c3ea3a0000000058812bc0
+
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ .octa 0x0000000068df31b40000000088b8c12e
+
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ .octa 0x00000000b059b6c200000001230b234c
+
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ .octa 0x0000000145fb8ed800000001120b416e
+
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ .octa 0x00000000cbc0916800000001974aecb0
+
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ .octa 0x000000005ceeedc2000000008ee3f226
+
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ .octa 0x0000000047d74e8600000001089aba9a
+
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ .octa 0x00000001407e9e220000000065113872
+
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ .octa 0x00000001da967bda000000005c07ec10
+
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ .octa 0x000000006c8983680000000187590924
+
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ .octa 0x00000000f2d14c9800000000e35da7c6
+
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ .octa 0x00000001993c6ad4000000000415855a
+
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ .octa 0x000000014683d1ac0000000073617758
+
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ .octa 0x00000001a7c93e6c0000000176021d28
+
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ .octa 0x000000010211e90a00000001c358fd0a
+
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ .octa 0x000000001119403e00000001ff7a2c18
+
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ .octa 0x000000001c3261aa00000000f2d9f7e4
+
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ .octa 0x000000014e37a634000000016cf1f9c8
+
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ .octa 0x0000000073786c0c000000010af9279a
+
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ .octa 0x000000011dc037f80000000004f101e8
+
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ .octa 0x0000000031433dfc0000000070bcf184
+
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ .octa 0x000000009cde8348000000000a8de642
+
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ .octa 0x0000000038d3c2a60000000062ea130c
+
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ .octa 0x000000011b25f26000000001eb31cbb2
+
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ .octa 0x000000001629e6f00000000170783448
+
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ .octa 0x0000000160838b4c00000001a684b4c6
+
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ .octa 0x000000007a44011c00000000253ca5b4
+
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ .octa 0x00000000226f417a0000000057b4b1e2
+
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ .octa 0x0000000045eb2eb400000000b6bd084c
+
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ .octa 0x000000014459d70c0000000123c2d592
+
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ .octa 0x00000001d406ed8200000000159dafce
+
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ .octa 0x0000000160c8e1a80000000127e1a64e
+
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ .octa 0x0000000027ba80980000000056860754
+
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ .octa 0x000000006d92d01800000001e661aae8
+
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ .octa 0x000000012ed7e3f200000000f82c6166
+
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ .octa 0x000000002dc8778800000000c4f9c7ae
+
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ .octa 0x0000000018240bb80000000074203d20
+
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ .octa 0x000000001ad381580000000198173052
+
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ .octa 0x00000001396b78f200000001ce8aba54
+
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ .octa 0x000000011a68133400000001850d5d94
+
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ .octa 0x000000012104732e00000001d609239c
+
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ .octa 0x00000000a140d90c000000001595f048
+
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ .octa 0x00000001b7215eda0000000042ccee08
+
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ .octa 0x00000001aaf1df3c000000010a389d74
+
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ .octa 0x0000000029d15b8a000000012a840da6
+
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ .octa 0x00000000f1a96922000000001d181c0c
+
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ .octa 0x00000001ac80d03c0000000068b7d1f6
+
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ .octa 0x000000000f11d56a000000005b0f14fc
+
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ .octa 0x00000001f1c022a20000000179e9e730
+
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ .octa 0x0000000173d00ae200000001ce1368d6
+
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ .octa 0x00000001d4ffe4ac0000000112c3a84c
+
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ .octa 0x000000016edc5ae400000000de940fee
+
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ .octa 0x00000001f1a0214000000000fe896b7e
+
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ .octa 0x00000000ca0b28a000000001f797431c
+
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ .octa 0x00000001928e30a20000000053e989ba
+
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ .octa 0x0000000097b1b002000000003920cd16
+
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ .octa 0x00000000b15bf90600000001e6f579b8
+
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ .octa 0x00000000411c5d52000000007493cb0a
+
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ .octa 0x00000001c36f330000000001bdd376d8
+
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ .octa 0x00000001119227e0000000016badfee6
+
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ .octa 0x00000000114d47020000000071de5c58
+
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ .octa 0x00000000458b5b9800000000453f317c
+
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ .octa 0x000000012e31fb8e0000000121675cce
+
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ .octa 0x000000005cf619d800000001f409ee92
+
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ .octa 0x0000000063f4d8b200000000f36b9c88
+
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ .octa 0x000000004138dc8a0000000036b398f4
+
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ .octa 0x00000001d29ee8e000000001748f9adc
+
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ .octa 0x000000006a08ace800000001be94ec00
+
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ .octa 0x0000000127d4201000000000b74370d6
+
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ .octa 0x0000000019d76b6200000001174d0b98
+
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ .octa 0x00000001b1471f6e00000000befc06a4
+
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ .octa 0x00000001f64c19cc00000001ae125288
+
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ .octa 0x00000000003c0ea00000000095c19b34
+
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ .octa 0x000000014d73abf600000001a78496f2
+
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ .octa 0x00000001620eb84400000001ac5390a0
+
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ .octa 0x0000000147655048000000002a80ed6e
+
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ .octa 0x0000000067b5077e00000001fa9b0128
+
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ .octa 0x0000000010ffe20600000001ea94929e
+
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ .octa 0x000000000fee8f1e0000000125f4305c
+
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ .octa 0x00000001da26fbae00000001471e2002
+
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ .octa 0x00000001b3a8bd880000000132d2253a
+
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ .octa 0x00000000e8f3898e00000000f26b3592
+
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ .octa 0x00000000b0d0d28c00000000bc8b67b0
+
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ .octa 0x0000000030f2a798000000013a826ef2
+
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ .octa 0x000000000fba10020000000081482c84
+
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ .octa 0x00000000bdb9bd7200000000e77307c2
+
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ .octa 0x0000000075d3bf5a00000000d4a07ec8
+
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ .octa 0x00000000ef1f98a00000000017102100
+
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ .octa 0x00000000689c760200000000db406486
+
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ .octa 0x000000016d5fa5fe0000000192db7f88
+
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ .octa 0x00000001d0d2b9ca000000018bf67b1e
+
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ .octa 0x0000000041e7b470000000007c09163e
+
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ .octa 0x00000001cbb6495e000000000adac060
+
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ .octa 0x000000010052a0b000000000bd8316ae
+
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ .octa 0x00000001d8effb5c000000019f09ab54
+
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ .octa 0x00000001d969853c0000000125155542
+
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ .octa 0x00000000523ccce2000000018fdb5882
+
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ .octa 0x000000001e2436bc00000000e794b3f4
+
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ .octa 0x00000000ddd1c3a2000000016f9bb022
+
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ .octa 0x0000000019fcfe3800000000290c9978
+
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ .octa 0x00000001ce95db640000000083c0f350
+
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ .octa 0x00000000af5828060000000173ea6628
+
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ .octa 0x00000001006388f600000001c8b4e00a
+
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ .octa 0x0000000179eca00a00000000de95d6aa
+
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ .octa 0x0000000122410a6a000000010b7f7248
+
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ .octa 0x000000004288e87c00000001326e3a06
+
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ .octa 0x000000016c5490da00000000bb62c2e6
+
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ .octa 0x00000000d1c71f6e0000000156a4b2c2
+
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ .octa 0x00000001b4ce08a6000000011dfe763a
+
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ .octa 0x00000001466ba60c000000007bcca8e2
+
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ .octa 0x00000001f6c488a40000000186118faa
+
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ .octa 0x000000013bfb06820000000111a65a88
+
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ .octa 0x00000000690e9e54000000003565e1c4
+
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ .octa 0x00000000281346b6000000012ed02a82
+
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ .octa 0x000000015646402400000000c486ecfc
+
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ .octa 0x000000016063a8dc0000000001b951b2
+
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ .octa 0x0000000116a663620000000048143916
+
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ .octa 0x000000017e8aa4d200000001dc2ae124
+
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ .octa 0x00000001728eb10c00000001416c58d6
+
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ .octa 0x00000001b08fd7fa00000000a479744a
+
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ .octa 0x00000001092a16e80000000096ca3a26
+
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ .octa 0x00000000a505637c00000000ff223d4e
+
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ .octa 0x00000000d94869b2000000010e84da42
+
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ .octa 0x00000001c8b203ae00000001b61ba3d0
+
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ .octa 0x000000005704aea000000000680f2de8
+
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ .octa 0x000000012e295fa2000000008772a9a8
+
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ .octa 0x000000011d0908bc0000000155f295bc
+
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ .octa 0x0000000193ed97ea00000000595f9282
+
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ .octa 0x000000013a0f1c520000000164b1c25a
+
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ .octa 0x000000010c2c40c000000000fbd67c50
+
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ .octa 0x00000000ff6fac3e0000000096076268
+
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ .octa 0x000000017b3609c000000001d288e4cc
+
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ .octa 0x0000000088c8c92200000001eaac1bdc
+
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ .octa 0x00000001751baae600000001f1ea39e2
+
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ .octa 0x000000010795297200000001eb6506fc
+
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ .octa 0x0000000162b00abe000000010f806ffe
+
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ .octa 0x000000000d7b404c000000010408481e
+
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ .octa 0x00000000763b13d40000000188260534
+
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ .octa 0x00000000f6dc22d80000000058fc73e0
+
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ .octa 0x000000007daae06000000000391c59b8
+
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ .octa 0x000000013359ab7c000000018b638400
+
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ .octa 0x000000008add438a000000011738f5c4
+
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ .octa 0x00000001edbefdea000000008cf7c6da
+
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ .octa 0x000000004104e0f800000001ef97fb16
+
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ .octa 0x00000000b48a82220000000102130e20
+
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ .octa 0x00000001bcb4684400000000db968898
+
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ .octa 0x000000013293ce0a00000000b5047b5e
+
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ .octa 0x00000001710d0844000000010b90fdb2
+
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ .octa 0x0000000117907f6e000000004834a32e
+
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ .octa 0x0000000087ddf93e0000000059c8f2b0
+
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ .octa 0x000000005970e9b00000000122cec508
+
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ .octa 0x0000000185b2b7d0000000000a330cda
+
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ .octa 0x00000001dcee0efc000000014a47148c
+
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ .octa 0x0000000030da27220000000042c61cb8
+
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ .octa 0x000000012f925a180000000012fe6960
+
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ .octa 0x00000000dd2e357c00000000dbda2c20
+
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ .octa 0x00000000071c80de000000011122410c
+
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ .octa 0x000000011513140a00000000977b2070
+
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ .octa 0x00000001df876e8e000000014050438e
+
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ .octa 0x000000015f81d6ce0000000147c840e8
+
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ .octa 0x000000019dd94dbe00000001cc7c88ce
+
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ .octa 0x00000001373d206e00000001476b35a4
+
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ .octa 0x00000000668ccade000000013d52d508
+
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ .octa 0x00000001b192d268000000008e4be32e
+
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ .octa 0x00000000e30f3a7800000000024120fe
+
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ .octa 0x000000010ef1f7bc00000000ddecddb4
+
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ .octa 0x00000001f5ac738000000000d4d403bc
+
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ .octa 0x000000011822ea7000000001734b89aa
+
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ .octa 0x00000000c3a33848000000010e7a58d6
+
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ .octa 0x00000001bd151c2400000001f9f04e9c
+
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ .octa 0x0000000056002d7600000000b692225e
+
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ .octa 0x000000014657c4f4000000019b8d3f3e
+
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ .octa 0x0000000113742d7c00000001a874f11e
+
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ .octa 0x000000019c5920ba000000010d5a4254
+
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ .octa 0x000000005216d2d600000000bbb2f5d6
+
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ .octa 0x0000000136f5ad8a0000000179cc0e36
+
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ .octa 0x000000018b07beb600000001dca1da4a
+
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ .octa 0x00000000db1e93b000000000feb1a192
+
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ .octa 0x000000000b96fa3a00000000d1eeedd6
+
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ .octa 0x00000001d9968af0000000008fad9bb4
+
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ .octa 0x000000000e4a77a200000001884938e4
+
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ .octa 0x00000000508c2ac800000001bc2e9bc0
+
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ .octa 0x0000000021572a8000000001f9658a68
+
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ .octa 0x00000001b859daf2000000001b9224fc
+
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ .octa 0x000000016f7884740000000055b2fb84
+
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ .octa 0x00000001b438810e000000018b090348
+
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ .octa 0x0000000095ddc6f2000000011ccbd5ea
+
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ .octa 0x00000001d977c20c0000000007ae47f8
+
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ .octa 0x00000000ebedb99a0000000172acbec0
+
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ .octa 0x00000001df9e9e9200000001c6e3ff20
+
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ .octa 0x00000001a4a3f95200000000e1b38744
+
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ .octa 0x00000000e2f5122000000000791585b2
+
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ .octa 0x000000004aa01f3e00000000ac53b894
+
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ .octa 0x00000000b3e90a5800000001ed5f2cf4
+
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ .octa 0x000000000c9ca2aa00000001df48b2e0
+
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ .octa 0x000000015168231600000000049c1c62
+
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ .octa 0x0000000036fce78c000000017c460c12
+
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ .octa 0x000000009037dc10000000015be4da7e
+
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ .octa 0x00000000d3298582000000010f38f668
+
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ .octa 0x00000001b42e8ad60000000039f40a00
+
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ .octa 0x00000000142a983800000000bd4c10c4
+
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ .octa 0x0000000109c7f1900000000042db1d98
+
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ .octa 0x0000000056ff931000000001c905bae6
+
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ .octa 0x00000001594513aa00000000069d40ea
+
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ .octa 0x00000001e3b5b1e8000000008e4fbad0
+
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ .octa 0x000000011dd5fc080000000047bedd46
+
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ .octa 0x00000001675f0cc20000000026396bf8
+
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ .octa 0x00000000d1c8dd4400000000379beb92
+
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ .octa 0x0000000115ebd3d8000000000abae54a
+
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ .octa 0x00000001ecbd0dac0000000007e6a128
+
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ .octa 0x00000000cdf67af2000000000ade29d2
+
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ .octa 0x000000004c01ff4c00000000f974c45c
+
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ .octa 0x00000000f2d8657e00000000e77ac60a
+
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ .octa 0x000000006bae74c40000000145895816
+
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ .octa 0x0000000152af8aa00000000038e362be
+
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ .octa 0x0000000004663802000000007f991a64
+
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ .octa 0x00000001ab2f5afc00000000fa366d3a
+
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ .octa 0x0000000074a4ebd400000001a2bb34f0
+
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ .octa 0x00000001d7ab3a4c0000000028a9981e
+
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ .octa 0x00000001a8da60c600000001dbc672be
+
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ .octa 0x000000013cf6382000000000b04d77f6
+
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ .octa 0x00000000bec12e1e0000000124400d96
+
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ .octa 0x00000001c6368010000000014ca4b414
+
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ .octa 0x00000001e6e78758000000012fe2c938
+
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ .octa 0x000000008d7f2b3c00000001faed01e6
+
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ .octa 0x000000016b4a156e000000007e80ecfe
+
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ .octa 0x00000001c63cfeb60000000098daee94
+
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ .octa 0x000000015f902670000000010a04edea
+
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ .octa 0x00000001cd5de11e00000001c00b4524
+
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ .octa 0x000000001acaec540000000170296550
+
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ .octa 0x000000002bd0ca780000000181afaa48
+
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ .octa 0x0000000032d63d5c0000000185a31ffa
+
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ .octa 0x000000001c6d4e4c000000002469f608
+
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ .octa 0x0000000106a60b92000000006980102a
+
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ .octa 0x00000000d3855e120000000111ea9ca8
+
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ .octa 0x00000000e312563600000001bd1d29ce
+
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ .octa 0x000000009e8f7ea400000001b34b9580
+
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ .octa 0x00000001c82e562c000000003076054e
+
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ .octa 0x00000000ca9f09ce000000012a608ea4
+
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ .octa 0x00000000c63764e600000000784d05fe
+
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ .octa 0x0000000168d2e49e000000016ef0d82a
+
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ .octa 0x00000000e986c1480000000075bda454
+
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ .octa 0x00000000cfb65894000000003dc0a1c4
+
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ .octa 0x0000000111cadee400000000e9a5d8be
+
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ .octa 0x0000000171fb63ce00000001609bc4b4
+
+ .short_constants :
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include
+ the trailing 32 bits of zeros */
+ /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod
+ p(x)` */
+ .octa 0x7fec2963e5bf80485cf015c388e56f72
+
+ /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod
+ p(x)` */
+ .octa 0x38e888d4844752a9963a18920246e2e6
+
+ /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod
+ p(x)` */
+ .octa 0x42316c00730206ad419a441956993a31
+
+ /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod
+ p(x)` */
+ .octa 0x543d5c543e65ddf9924752ba2b830011
+
+ /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod
+ p(x)` */
+ .octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+ /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod
+ p(x)` */
+ .octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+ /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod
+ p(x)` */
+ .octa 0x3f4840246791d588c133722b1fe0b5c3
+
+ /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod
+ p(x)` */
+ .octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+ /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)`
+ */
+ .octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+ /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+ .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+ /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+ .octa 0x041d37768cd75659817cdc5119b29a35
+
+ /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+ .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+ /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+ .octa 0x0e148e8252377a554f256efcb82be955
+
+ /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+ .octa 0x9c25531d19e65ddeec1631edb2dea967
+
+ /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+ .octa 0x790606ff9957c0a65d27e147510ac59a
+
+ /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+ .octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+ .barrett_constants :
+ /* 33 bit reflected Barrett constant m - (4^32)/n */
+ .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
+ /* 33 bit reflected Barrett constant n */
+ .octa 0x00000000000000000000000105ec76f1
+#endif
diff --git a/src/rocksdb/util/crc32c_test.cc b/src/rocksdb/util/crc32c_test.cc
new file mode 100644
index 000000000..3e4f7396e
--- /dev/null
+++ b/src/rocksdb/util/crc32c_test.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "util/crc32c.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+class CRC { };
+
+
+// Tests for the 3-way crc32c algorithm. We need these tests because it uses
+// different lookup tables than the original Fast_CRC32.
+const unsigned int BUFFER_SIZE = 512 * 1024 * sizeof(uint64_t);  // size of `buffer` in bytes: 512Ki uint64_t-sized words
+char buffer[BUFFER_SIZE];  // shared test input; main() fills it with a deterministic pattern before the tests run
+
+struct ExpectedResult {  // one test vector: a slice of `buffer` and the checksum expected for it
+ size_t offset;  // start of the slice, in bytes from the beginning of `buffer`
+ size_t length;  // number of bytes passed to Value()
+ uint32_t crc32c;  // stored complemented: the tests compare ~crc32c against the result of Value()
+};
+
+ExpectedResult expectedResults[] = {  // entries are { offset, length, crc32c }; consumed by TEST(CRC, StandardResults)
+ // Zero-byte input
+ { 0, 0, ~0U },
+ // Small aligned inputs to test special cases in SIMD implementations
+ { 8, 1, 1543413366 },
+ { 8, 2, 523493126 },
+ { 8, 3, 1560427360 },
+ { 8, 4, 3422504776 },
+ { 8, 5, 447841138 },
+ { 8, 6, 3910050499 },
+ { 8, 7, 3346241981 },
+ // Small unaligned inputs
+ { 9, 1, 3855826643 },
+ { 10, 2, 560880875 },
+ { 11, 3, 1479707779 },
+ { 12, 4, 2237687071 },
+ { 13, 5, 4063855784 },
+ { 14, 6, 2553454047 },
+ { 15, 7, 1349220140 },
+ // Larger inputs to test leftover chunks at the end of aligned blocks
+ { 8, 8, 627613930 },
+ { 8, 9, 2105929409 },
+ { 8, 10, 2447068514 },
+ { 8, 11, 863807079 },
+ { 8, 12, 292050879 },
+ { 8, 13, 1411837737 },
+ { 8, 14, 2614515001 },
+ { 8, 15, 3579076296 },
+ { 8, 16, 2897079161 },
+ { 8, 17, 675168386 },
+ // Much larger inputs
+ { 0, BUFFER_SIZE, 2096790750 },
+ { 1, BUFFER_SIZE / 2, 3854797577 },
+
+};
+
+TEST(CRC, StandardResults) {  // checks Value()/Extend() against fixed vectors and the expectedResults table
+
+ // Original Fast_CRC32 tests.
+ // From rfc3720 section B.4.
+ char buf[32];
+
+ memset(buf, 0, sizeof(buf));  // 32 bytes of 0x00
+ ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf)));
+
+ memset(buf, 0xff, sizeof(buf));  // 32 bytes of 0xff
+ ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf)));
+
+ for (int i = 0; i < 32; i++) {  // ascending bytes 0..31
+ buf[i] = static_cast<char>(i);
+ }
+ ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf)));
+
+ for (int i = 0; i < 32; i++) {  // descending bytes 31..0
+ buf[i] = static_cast<char>(31 - i);
+ }
+ ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));
+
+ unsigned char data[48] = {  // fixed 48-byte input with a known checksum
+ 0x01, 0xc0, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x14, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x18,
+ 0x28, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x02, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ };
+ ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+
+ // 3-Way Crc32c tests ported from folly.
+ // Test 1: single computation
+ for (auto expected : expectedResults) {
+ uint32_t result = Value(buffer + expected.offset, expected.length);
+ EXPECT_EQ(~expected.crc32c, result);  // the table stores the complement of the value Value() returns
+ }
+
+ // Test 2: stitching two computations
+ for (auto expected : expectedResults) {
+ size_t partialLength = expected.length / 2;  // first half, rounded down (0 for lengths 0 and 1)
+ uint32_t partialChecksum = Value(buffer + expected.offset, partialLength);
+ uint32_t result = Extend(partialChecksum,  // continue the checksum over the remaining bytes
+ buffer + expected.offset + partialLength,
+ expected.length - partialLength);
+ EXPECT_EQ(~expected.crc32c, result);  // must match the single-pass result checked in Test 1
+ }
+
+}
+
+TEST(CRC, Values) {  // sanity check: two different inputs produce different checksums
+ ASSERT_NE(Value("a", 1), Value("foo", 3));
+}
+
+TEST(CRC, Extend) {  // extending the checksum of a prefix with the suffix equals the checksum of the whole string
+ ASSERT_EQ(Value("hello world", 11),
+ Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {  // Mask() must change the value and Unmask() must undo it, also when applied twice
+ uint32_t crc = Value("foo", 3);
+ ASSERT_NE(crc, Mask(crc));  // masking changes the value
+ ASSERT_NE(crc, Mask(Mask(crc)));  // masking twice does not return to the original
+ ASSERT_EQ(crc, Unmask(Mask(crc)));  // Unmask inverts Mask
+ ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));  // and the round trip also holds when nested
+}
+
+} // namespace crc32c
+} // namespace ROCKSDB_NAMESPACE
+
+// 64-bit FNV-style hash over the n bytes at buf, continuing from `hash`; returns the updated hash (copied from folly)
+const uint64_t FNV_64_HASH_START = 14695981039346656037ULL;  // default starting value for fnv64_buf
+inline uint64_t fnv64_buf(const void* buf,
+ size_t n,
+ uint64_t hash = FNV_64_HASH_START) {
+ // forcing signed char, since other platforms can use unsigned
+ const signed char* char_buf = reinterpret_cast<const signed char*>(buf);
+
+ for (size_t i = 0; i < n; ++i) {
+ hash += (hash << 1) + (hash << 4) + (hash << 5) + (hash << 7) +  // together with the next line: hash *= 0x100000001b3
+ (hash << 8) + (hash << 40);  // i.e. multiply by 2^40 + 0x1b3, modulo 2^64
+ hash ^= char_buf[i];  // the signed byte is sign-extended, so bytes >= 0x80 also flip the upper 56 bits
+ }
+ return hash;
+}
+
+int main(int argc, char** argv) {  // fills crc32c::buffer with the test pattern, then runs all registered tests
+ ::testing::InitGoogleTest(&argc, argv);
+
+ // Populate a buffer with a deterministic pattern
+ // on which to compute checksums
+
+ const uint8_t* src = (uint8_t*)ROCKSDB_NAMESPACE::crc32c::buffer;  // read cursor: the word hashed next
+ uint64_t* dst = (uint64_t*)ROCKSDB_NAMESPACE::crc32c::buffer;  // write cursor: the word filled next
+ const uint64_t* end =
+ (const uint64_t*)(ROCKSDB_NAMESPACE::crc32c::buffer +
+ ROCKSDB_NAMESPACE::crc32c::BUFFER_SIZE);
+ *dst++ = 0;  // first word is zero; from here on dst stays one word ahead of src
+ while (dst < end) {
+ ROCKSDB_NAMESPACE::EncodeFixed64(  // each word is the fnv64 hash of the 8 bytes of the word before it
+ reinterpret_cast<char*>(dst),
+ fnv64_buf((const char*)src, sizeof(uint64_t)));
+ dst++;
+ src += sizeof(uint64_t);
+ }
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/defer.h b/src/rocksdb/util/defer.h
new file mode 100644
index 000000000..cb0b42a36
--- /dev/null
+++ b/src/rocksdb/util/defer.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Defers the execution of the provided function until the Defer
+// object goes out of scope.
+//
+// Usage example:
+//
+// Status DeferTest() {
+// Status s;
+// Defer defer([&s]() {
+// if (!s.ok()) {
+// // do cleanups ...
+// }
+// });
+// // do something ...
+// if (!s.ok()) return s;
+// // do some other things ...
+// return s;
+// }
+//
+// The above code ensures that cleanups will always happen on returning.
+//
+// Without the help of Defer, you can either
+// 1. do the cleanup at every point where !s.ok(), or
+// 2. instead of returning when !s.ok(), continue the work only when s.ok(),
+// but sometimes this leads to nested blocks of "if (s.ok()) {...}".
+//
+// With the help of Defer, you can centralize the cleanup logic inside the
+// lambda passed to Defer, and you can return immediately on failure when necessary.
+class Defer final {  // scope guard: runs the stored callable exactly once, when the object is destroyed
+ public:
+ Defer(std::function<void()>&& fn) : fn_(std::move(fn)) {}  // takes ownership of fn by moving it into fn_
+ ~Defer() { fn_(); }  // always invokes fn_; NOTE(review): an empty std::function would throw std::bad_function_call here
+
+ // Disallow copy.
+ Defer(const Defer&) = delete;
+ Defer& operator=(const Defer&) = delete;
+
+ private:
+ std::function<void()> fn_;  // the deferred action
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/defer_test.cc b/src/rocksdb/util/defer_test.cc
new file mode 100644
index 000000000..e13b25efb
--- /dev/null
+++ b/src/rocksdb/util/defer_test.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeferTest {};
+
+TEST(DeferTest, BlockScope) {  // the deferred lambda runs when the enclosing block ends
+ int v = 1;
+ {
+ Defer defer([&v]() { v *= 2; });
+ }  // defer is destroyed here, doubling v
+ ASSERT_EQ(2, v);
+}
+
+TEST(DeferTest, FunctionScope) {  // the deferred lambda runs on function exit, after the rest of the body
+ int v = 1;
+ auto f = [&v]() {
+ Defer defer([&v]() { v *= 2; });
+ v = 2;  // executes before the deferred doubling, so v ends up as 4
+ };
+ f();
+ ASSERT_EQ(4, v);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {  // test entry point: runs every registered test
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();  // presumably makes crashes print a stack trace - defined outside this file
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/duplicate_detector.h b/src/rocksdb/util/duplicate_detector.h
new file mode 100644
index 000000000..72920ca3c
--- /dev/null
+++ b/src/rocksdb/util/duplicate_detector.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "util/set_comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// During recovery, if the memtable is flushed we cannot rely on its help for
+// duplicate key detection, as key insertion will not be attempted. This class
+// is used as an emulator of the memtable to tell whether insertion of a
+// key/seq would have resulted in duplication.
+class DuplicateDetector {
+ public:
+ explicit DuplicateDetector(DBImpl* db) : db_(db) {}  // db is kept as a raw pointer and used to look up column family handles
+ bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) {  // true iff key was already recorded for cf since the last reset
+ assert(seq >= batch_seq_);  // sequence numbers must not decrease between calls
+ if (batch_seq_ != seq) { // it is a new batch
+ keys_.clear();
+ }
+ batch_seq_ = seq;
+ CFKeys& cf_keys = keys_[cf];  // operator[] creates an empty set for cf if there is none yet
+ if (cf_keys.size() == 0) { // just inserted
+ InitWithComp(cf);
+ }
+ auto it = cf_keys.insert(key);
+ if (it.second == false) { // second is false if an element already existed.
+ keys_.clear();  // drop everything recorded so far (cf_keys is dangling from here on)
+ InitWithComp(cf);
+ keys_[cf].insert(key);  // restart tracking with only the duplicate key recorded
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ SequenceNumber batch_seq_ = 0;  // seq passed to the most recent IsDuplicateKeySeq call
+ DBImpl* db_;
+ using CFKeys = std::set<Slice, SetComparator>;
+ std::map<uint32_t, CFKeys> keys_;  // column family id -> keys recorded since the last reset
+ void InitWithComp(const uint32_t cf) {  // replaces keys_[cf] with an empty set ordered by the column family's comparator
+ auto h = db_->GetColumnFamilyHandle(cf);
+ if (!h) {
+ // TODO(myabandeh): This is not a concern in MyRocks as drop cf is not
+ // implemented yet. When it does, we should return proper error instead
+ // of throwing exception.
+ ROCKS_LOG_FATAL(
+ db_->immutable_db_options().info_log,
+ "Recovering an entry from the dropped column family %" PRIu32
+ ". WAL must must have been emptied before dropping the column "
+ "family", cf);  // NOTE(review): this message and the one below both contain a duplicated "must must"
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error(
+ "Recovering an entry from a dropped column family. "
+ "WAL must must have been flushed before dropping the column "
+ "family");
+#endif
+ return;  // reachable only when the throw above is compiled out (ROCKSDB_LITE); keys_[cf] is left untouched
+ }
+ auto cmp = h->GetComparator();
+ keys_[cf] = CFKeys(SetComparator(cmp));
+ }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/dynamic_bloom.cc b/src/rocksdb/util/dynamic_bloom.cc
new file mode 100644
index 000000000..60d4b2cf2
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "dynamic_bloom.h"
+
+#include <algorithm>
+
+#include "memory/allocator.h"
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint32_t roundUpToPow2(uint32_t x) {  // returns the smallest power of two that is >= x (1 for x == 0 or x == 1)
+ uint32_t rv = 1;
+ while (rv < x) {  // NOTE(review): for x > 2^31 rv wraps to 0 and this loop never terminates
+ rv <<= 1;
+ }
+ return rv;
+}
+}
+
+DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,  // sizes, allocates, zeroes and aligns the filter's word array
+ uint32_t num_probes, size_t huge_page_tlb_size,
+ Logger* logger)
+ // Round down, except round up with 1
+ : kNumDoubleProbes((num_probes + (num_probes == 1)) / 2) {
+ assert(num_probes % 2 == 0); // limitation of current implementation; NOTE(review): this rejects num_probes == 1, so the round-up above only matters with NDEBUG
+ assert(num_probes <= 10); // limitation of current implementation
+ assert(kNumDoubleProbes > 0);
+
+ // Determine how much to round off + align by so that x ^ i (that's xor) is
+ // a valid u64 index if x is a valid u64 index and 0 <= i < kNumDoubleProbes.
+ uint32_t block_bytes = /*bytes/u64*/ 8 *
+ /*u64s*/ std::max(1U, roundUpToPow2(kNumDoubleProbes));
+ uint32_t block_bits = block_bytes * 8;
+ uint32_t blocks = (total_bits + block_bits - 1) / block_bits;  // ceil(total_bits / block_bits); NOTE(review): the sum wraps for total_bits close to UINT32_MAX
+ uint32_t sz = blocks * block_bytes;  // filter size in bytes, a whole number of blocks
+ kLen = sz / /*bytes/u64*/ 8;  // filter length in 64-bit words
+ assert(kLen > 0);  // fails for total_bits == 0
+#ifndef NDEBUG
+ for (uint32_t i = 0; i < kNumDoubleProbes; ++i) {
+ // Ensure probes starting at last word are in range
+ assert(((kLen - 1) ^ i) < kLen);
+ }
+#endif
+
+ // Padding to correct for allocation not originally aligned on block_bytes
+ // boundary
+ sz += block_bytes - 1;
+ assert(allocator);
+
+ char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+ memset(raw, 0, sz);  // start with all bits clear, padding included
+ auto block_offset = reinterpret_cast<uintptr_t>(raw) % block_bytes;
+ if (block_offset > 0) {
+ // Align on block_bytes boundary
+ raw += block_bytes - block_offset;  // skips at most block_bytes - 1 bytes, which the padding above covers
+ }
+ static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+ "Expecting zero-space-overhead atomic");
+ data_ = reinterpret_cast<std::atomic<uint64_t>*>(raw);  // the zeroed bytes are used directly as the array of atomic words
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/dynamic_bloom.h b/src/rocksdb/util/dynamic_bloom.h
new file mode 100644
index 000000000..d1f22cc75
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.h
@@ -0,0 +1,214 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <string>
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "table/multiget_context.h"
+#include "util/hash.h"
+
+#include <atomic>
+#include <memory>
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Allocator;
+class Logger;
+
+// A Bloom filter intended only to be used in memory, never serialized in a way
+// that could lead to schema incompatibility. Supports opt-in lock-free
+// concurrent access.
+//
+// This implementation is also intended for applications generally preferring
+// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate.
+// For 1% FP rate, that means that the latency of a look-up triggered by an FP
+// should be less than roughly 100x the cost of a Bloom filter op.
+//
+// For simplicity and performance, the current implementation requires
+// num_probes to be a multiple of two and <= 10.
+//
+class DynamicBloom {
+ public:
+ // Builds an empty filter.
+ // allocator: source of the filter's memory (the constructor obtains it
+ // through Allocator::AllocateAligned), hence trace the usage of
+ // memory
+ // total_bits: requested number of bits for the bloom; the constructor
+ // rounds it up to a whole number of blocks
+ // num_probes: number of hash probes for a single key; must be even and
+ // <= 10 (asserted in debug builds)
+ // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
+ // within this page size. Need to reserve huge pages for
+ // it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ // logger: forwarded to the allocator together with huge_page_tlb_size
+ explicit DynamicBloom(Allocator* allocator, uint32_t total_bits,
+ uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+ Logger* logger = nullptr);
+
+ // Empty: the storage behind data_ is not released here.
+ ~DynamicBloom() {}
+
+ // Assuming single threaded access to this function.
+ void Add(const Slice& key);
+
+ // Like Add, but may be called concurrent with other functions.
+ void AddConcurrently(const Slice& key);
+
+ // Assuming single threaded access to this function.
+ void AddHash(uint32_t hash);
+
+ // Like AddHash, but may be called concurrent with other functions.
+ void AddHashConcurrently(uint32_t hash);
+
+ // Multithreaded access to this function is OK
+ bool MayContain(const Slice& key) const;
+
+ // Batched form: writes the answer for *keys[i] into may_match[i] for each
+ // i in [0, num_keys). The implementation uses scratch arrays of
+ // MultiGetContext::MAX_BATCH_SIZE entries and does not check num_keys
+ // against that size.
+ void MayContain(int num_keys, Slice** keys, bool* may_match) const;
+
+ // Multithreaded access to this function is OK
+ bool MayContainHash(uint32_t hash) const;
+
+ // Issues PREFETCH for the first word that hash `h` maps to.
+ void Prefetch(uint32_t h);
+
+ private:
+ // Length of the structure, in 64-bit words. For this structure, "word"
+ // will always refer to 64-bit words.
+ uint32_t kLen;
+ // We make the k probes in pairs, two for each 64-bit read/write. Thus,
+ // this stores k/2, the number of words to double-probe.
+ const uint32_t kNumDoubleProbes;
+
+ // Start of the filter's kLen words (set by the constructor).
+ std::atomic<uint64_t>* data_;
+
+ // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
+ // concurrency safety, working with 64-bit words.
+ template <typename OrFunc>
+ void AddHash(uint32_t hash, const OrFunc& or_func);
+
+ // Checks the bits selected by h32, starting at word index `a` of data_.
+ bool DoubleProbe(uint32_t h32, size_t a) const;
+};
+
+// Single-writer insert: hashes `key` with BloomHash and records that hash.
+inline void DynamicBloom::Add(const Slice& key) {
+  const uint32_t hash = BloomHash(key);
+  AddHash(hash);
+}
+
+// Insert that may run concurrently with other calls: hashes `key` with
+// BloomHash and records the hash through AddHashConcurrently.
+inline void DynamicBloom::AddConcurrently(const Slice& key) {
+  const uint32_t hash = BloomHash(key);
+  AddHashConcurrently(hash);
+}
+
+// Single-writer insert of an already computed hash.
+inline void DynamicBloom::AddHash(uint32_t hash) {
+  // Separate relaxed load and store rather than an atomic read-modify-write;
+  // this overload assumes there is no concurrent writer.
+  auto plain_or = [](std::atomic<uint64_t>* word, uint64_t bits) {
+    const uint64_t current = word->load(std::memory_order_relaxed);
+    word->store(current | bits, std::memory_order_relaxed);
+  };
+  AddHash(hash, plain_or);
+}
+
+// Insert of an already computed hash that may run concurrently with other
+// functions of this filter.
+inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
+  // Happens-before between adding a hash and a later MayContain is handled
+  // by access to versions_->LastSequence(), so all that is needed here is to
+  // avoid races (so the compiler gets no license to mess up the code) and to
+  // not lose bits. std::memory_order_relaxed is enough for that.
+  auto atomic_or = [](std::atomic<uint64_t>* word, uint64_t bits) {
+    const uint64_t current = word->load(std::memory_order_relaxed);
+    if ((current & bits) == bits) {
+      return;  // every requested bit is already set
+    }
+    word->fetch_or(bits, std::memory_order_relaxed);
+  };
+  AddHash(hash, atomic_or);
+}
+
+// Hashes `key` with BloomHash and reports whether that hash may be present.
+inline bool DynamicBloom::MayContain(const Slice& key) const {
+  const uint32_t hash = BloomHash(key);
+  return MayContainHash(hash);
+}
+
+// Batched lookup: sets may_match[i] to the answer for *keys[i] for every i
+// in [0, num_keys). The first pass hashes each key and issues PREFETCH for
+// its first word; the second pass does the probing. The scratch arrays hold
+// MultiGetContext::MAX_BATCH_SIZE entries and num_keys is not checked
+// against that bound here.
+inline void DynamicBloom::MayContain(int num_keys, Slice** keys,
+                                     bool* may_match) const {
+  std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> key_hashes;
+  std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> first_words;
+  for (int idx = 0; idx < num_keys; ++idx) {
+    const uint32_t h32 = BloomHash(*keys[idx]);
+    const size_t word = fastrange32(kLen, h32);
+    PREFETCH(data_ + word, 0, 3);
+    key_hashes[idx] = h32;
+    first_words[idx] = word;
+  }
+
+  for (int idx = 0; idx < num_keys; ++idx) {
+    may_match[idx] = DoubleProbe(key_hashes[idx], first_words[idx]);
+  }
+}
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+// Issues PREFETCH for the first word that `h32` maps to. The local is kept
+// (rather than folded into the macro argument) so the surrounding MSVC
+// warning suppression for an unreferenced local still applies.
+inline void DynamicBloom::Prefetch(uint32_t h32) {
+  const size_t first_word = fastrange32(kLen, h32);
+  PREFETCH(data_ + first_word, 0, 3);
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+// Speed hacks in this implementation:
+// * Uses fastrange instead of %
+// * Minimum logic to determine first (and all) probed memory addresses.
+// (Uses constant bit-xor offsets from the starting probe address.)
+// * (Major) Two probes per 64-bit memory fetch/write.
+// Code simplification / optimization: only allow even number of probes.
+// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At
+// least on recent CPUs, integer multiplication is very cheap. Each 64-bit
+// remix provides five pairs of bit addresses within a uint64_t.)
+// Code simplification / optimization: only allow up to 10 probes, from a
+// single 64-bit remix.
+//
+// The FP rate penalty for this implementation, vs. standard Bloom filter, is
+// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom.
+// This implementation does not explicitly use the cache line size, but is
+// effectively cache-local (up to 16 probes) because of the bit-xor offsetting.
+//
+// NB: could easily be upgraded to support a 64-bit hash and
+// total_bits > 2^32 (512MB). (The latter is a bad idea without the former,
+// because of false positives.)
+
+// Maps `h32` to its first word with fastrange32, issues PREFETCH for that
+// word, then checks the probed bits.
+inline bool DynamicBloom::MayContainHash(uint32_t h32) const {
+  const size_t first_word = fastrange32(kLen, h32);
+  PREFETCH(data_ + first_word, 0, 3);
+  const bool all_bits_set = DoubleProbe(h32, first_word);
+  return all_bits_set;
+}
+
+// Returns true iff, in each of the kNumDoubleProbes words visited, both bits
+// selected by `h32` are set. Despite its name, `byte_offset` is used as an
+// index into data_, i.e. it counts 64-bit words, not bytes.
+inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const {
+  // Expand/remix with 64-bit golden ratio
+  uint64_t remix = 0x9e3779b97f4a7c13ULL * h32;
+  unsigned probe = 0;
+  while (true) {
+    // Two bit probes per uint64_t probe, taken from the low 12 remix bits.
+    const uint64_t first_bit = (uint64_t)1 << (remix & 63);
+    const uint64_t second_bit = (uint64_t)1 << ((remix >> 6) & 63);
+    const uint64_t mask = first_bit | second_bit;
+    const uint64_t word =
+        data_[byte_offset ^ probe].load(std::memory_order_relaxed);
+    if ((word & mask) != mask) {
+      return false;
+    }
+    if (probe + 1 >= kNumDoubleProbes) {
+      return true;
+    }
+    // Rotate right by 12 to expose the next pair of bit positions.
+    remix = (remix >> 12) | (remix << 52);
+    ++probe;
+  }
+}
+
+// Shared insertion loop: for each of the kNumDoubleProbes words selected by
+// `h32`, hands the word's address and a two-bit mask to `or_func`, which is
+// expected to OR the mask into the word.
+template <typename OrFunc>
+inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) {
+  const size_t first_word = fastrange32(kLen, h32);
+  PREFETCH(data_ + first_word, 0, 3);
+  // Expand/remix with 64-bit golden ratio
+  uint64_t remix = 0x9e3779b97f4a7c13ULL * h32;
+  unsigned probe = 0;
+  while (true) {
+    // Two bit probes per uint64_t probe
+    const uint64_t first_bit = (uint64_t)1 << (remix & 63);
+    const uint64_t second_bit = (uint64_t)1 << ((remix >> 6) & 63);
+    or_func(&data_[first_word ^ probe], first_bit | second_bit);
+    ++probe;
+    if (probe >= kNumDoubleProbes) {
+      return;
+    }
+    // Rotate right by 12 to expose the next pair of bit positions.
+    remix = (remix >> 12) | (remix << 52);
+  }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/dynamic_bloom_test.cc b/src/rocksdb/util/dynamic_bloom_test.cc
new file mode 100644
index 000000000..47d04a36c
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom_test.cc
@@ -0,0 +1,324 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+// Stand-in entry point for builds without gflags: prints a notice that the
+// test is skipped and exits successfully.
+int main() {
+  static const char kSkipNotice[] =
+      "Please install gflags to run this test... Skipping...\n";
+  fprintf(stderr, "%s", kSkipNotice);
+  return 0;
+}
+#else
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "dynamic_bloom.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/stop_watch.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(bits_per_key, 10, "");
+DEFINE_int32(num_probes, 6, "");
+DEFINE_bool(enable_perf, false, "");
+
+namespace ROCKSDB_NAMESPACE {
+
+// Produces test keys as Slices that point at this object's own fields, so a
+// returned Slice refers to whatever the most recent call stored.
+struct KeyMaker {
+  uint64_t a;
+  uint64_t b;
+
+  // Sequential, within a hash function block: the key is the bytes of `a`.
+  inline Slice Seq(uint64_t i) {
+    a = i;
+    char* const bytes = reinterpret_cast<char *>(&a);
+    return Slice(bytes, sizeof(a));
+  }
+
+  // Not quite sequential, varies across hash function blocks: the key is
+  // the bytes of the whole struct, with `b` derived from `i`.
+  inline Slice Nonseq(uint64_t i) {
+    a = i;
+    b = i * 123;
+    char* const bytes = reinterpret_cast<char *>(this);
+    return Slice(bytes, sizeof(*this));
+  }
+
+  // Picks between the two key shapes.
+  inline Slice Key(uint64_t i, bool nonseq) {
+    if (nonseq) {
+      return Nonseq(i);
+    }
+    return Seq(i);
+  }
+};
+
+// Empty googletest fixture that groups the DynamicBloom tests below.
+class DynamicBloomTest : public testing::Test {};
+
+// A filter that has had nothing added must not report any key.
+TEST_F(DynamicBloomTest, EmptyFilter) {
+ Arena arena;
+ // 100 requested bits, 2 probes.
+ DynamicBloom bloom1(&arena, 100, 2);
+ ASSERT_TRUE(!bloom1.MayContain("hello"));
+ ASSERT_TRUE(!bloom1.MayContain("world"));
+
+ // Same check with a requested size of CACHE_LINE_SIZE * 8 * 2 - 1 bits.
+ DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
+ ASSERT_TRUE(!bloom2.MayContain("hello"));
+ ASSERT_TRUE(!bloom2.MayContain("world"));
+}
+
+// After two single-threaded Add calls, both keys must be reported and two
+// keys that were never added are expected to be rejected.
+TEST_F(DynamicBloomTest, Small) {
+ Arena arena;
+ // 100 requested bits, 2 probes.
+ DynamicBloom bloom1(&arena, 100, 2);
+ bloom1.Add("hello");
+ bloom1.Add("world");
+ ASSERT_TRUE(bloom1.MayContain("hello"));
+ ASSERT_TRUE(bloom1.MayContain("world"));
+ ASSERT_TRUE(!bloom1.MayContain("x"));
+ ASSERT_TRUE(!bloom1.MayContain("foo"));
+
+ // Same sequence with a requested size of CACHE_LINE_SIZE * 8 * 2 - 1 bits.
+ DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
+ bloom2.Add("hello");
+ bloom2.Add("world");
+ ASSERT_TRUE(bloom2.MayContain("hello"));
+ ASSERT_TRUE(bloom2.MayContain("world"));
+ ASSERT_TRUE(!bloom2.MayContain("x"));
+ ASSERT_TRUE(!bloom2.MayContain("foo"));
+}
+
+// Same expectations as the Small test, but inserting through
+// AddConcurrently. The calls here are still made from a single thread.
+TEST_F(DynamicBloomTest, SmallConcurrentAdd) {
+ Arena arena;
+ // 100 requested bits, 2 probes.
+ DynamicBloom bloom1(&arena, 100, 2);
+ bloom1.AddConcurrently("hello");
+ bloom1.AddConcurrently("world");
+ ASSERT_TRUE(bloom1.MayContain("hello"));
+ ASSERT_TRUE(bloom1.MayContain("world"));
+ ASSERT_TRUE(!bloom1.MayContain("x"));
+ ASSERT_TRUE(!bloom1.MayContain("foo"));
+
+ // Same sequence with a requested size of CACHE_LINE_SIZE * 8 * 2 - 1 bits.
+ DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
+ bloom2.AddConcurrently("hello");
+ bloom2.AddConcurrently("world");
+ ASSERT_TRUE(bloom2.MayContain("hello"));
+ ASSERT_TRUE(bloom2.MayContain("world"));
+ ASSERT_TRUE(!bloom2.MayContain("x"));
+ ASSERT_TRUE(!bloom2.MayContain("foo"));
+}
+
+// Next key count for the VaryingLengths sweep: steps of 1 below 10, of 10
+// below 100, of 100 below 1000, and a factor of 2.6 (integer arithmetic)
+// from 1000 upwards.
+static uint32_t NextNum(uint32_t num) {
+  if (num >= 1000) {
+    return num * 26 / 10;
+  }
+  uint32_t step = 1;
+  if (num >= 100) {
+    step = 100;
+  } else if (num >= 10) {
+    step = 10;
+  }
+  return num + step;
+}
+
+// Sweeps the number of keys (see NextNum) for both key shapes. For each
+// count it builds a filter of FLAGS_bits_per_key bits per key, verifies that
+// every added key is reported, and measures the false positive rate on
+// 30000 keys that were not added. Filters above 1.25% count as mediocre and
+// may make up only a small fraction of the total.
+TEST_F(DynamicBloomTest, VaryingLengths) {
+  KeyMaker km;
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  // num_probes is a uint32_t, so it is printed with PRIu32 rather than %d.
+  fprintf(stderr, "bits_per_key: %d num_probes: %" PRIu32 "\n",
+          FLAGS_bits_per_key, num_probes);
+
+  // NB: FP rate impact of 32-bit hash is noticeable starting around 10M keys.
+  // But that effect is hidden if using sequential keys (unique hashes).
+  for (bool nonseq : {false, true}) {
+    const uint32_t max_num = FLAGS_enable_perf ? 40000000 : 400000;
+    for (uint32_t num = 1; num <= max_num; num = NextNum(num)) {
+      uint32_t bloom_bits = 0;
+      Arena arena;
+      bloom_bits = num * FLAGS_bits_per_key;
+      DynamicBloom bloom(&arena, bloom_bits, num_probes);
+      for (uint64_t i = 0; i < num; i++) {
+        bloom.Add(km.Key(i, nonseq));
+        ASSERT_TRUE(bloom.MayContain(km.Key(i, nonseq)));
+      }
+
+      // All added keys must match
+      for (uint64_t i = 0; i < num; i++) {
+        ASSERT_TRUE(bloom.MayContain(km.Key(i, nonseq)));
+      }
+
+      // Check false positive rate, using keys far above any added key
+      int result = 0;
+      for (uint64_t i = 0; i < 30000; i++) {
+        if (bloom.MayContain(km.Key(i + 1000000000, nonseq))) {
+          result++;
+        }
+      }
+      double rate = result / 30000.0;
+
+      fprintf(stderr,
+              "False positives (%s keys): "
+              "%5.2f%% @ num = %6u, bloom_bits = %6u\n",
+              nonseq ? "nonseq" : "seq", rate * 100.0, num, bloom_bits);
+
+      if (rate > 0.0125)
+        mediocre_filters++;  // Allowed, but not too often
+      else
+        good_filters++;
+    }
+  }
+
+  fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
+          mediocre_filters);
+  ASSERT_LE(mediocre_filters, good_filters / 25);
+}
+
+// Single-threaded latency report. Does nothing unless FLAGS_enable_perf is
+// set. For 8M, 16M, ... 64M sequential keys it times Add and MayContain and
+// prints the average nanoseconds per operation to stderr.
+TEST_F(DynamicBloomTest, perf) {
+ KeyMaker km;
+ StopWatchNano timer(Env::Default());
+ uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+ if (!FLAGS_enable_perf) {
+ return;
+ }
+
+ for (uint32_t m = 1; m <= 8; ++m) {
+ Arena arena;
+ const uint32_t num_keys = m * 8 * 1024 * 1024;
+ fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
+
+ // 10 bits per key.
+ DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
+
+ // Timed phase 1: insert every key.
+ timer.Start();
+ for (uint64_t i = 1; i <= num_keys; ++i) {
+ std_bloom.Add(km.Seq(i));
+ }
+
+ uint64_t elapsed = timer.ElapsedNanos();
+ fprintf(stderr, "dynamic bloom, avg add latency %3g\n",
+ static_cast<double>(elapsed) / num_keys);
+
+ // Timed phase 2: look up every inserted key; all of them must be found.
+ uint32_t count = 0;
+ timer.Start();
+ for (uint64_t i = 1; i <= num_keys; ++i) {
+ if (std_bloom.MayContain(km.Seq(i))) {
+ ++count;
+ }
+ }
+ ASSERT_EQ(count, num_keys);
+ elapsed = timer.ElapsedNanos();
+ // Guards the division below in debug builds.
+ assert(count > 0);
+ fprintf(stderr, "dynamic bloom, avg query latency %3g\n",
+ static_cast<double>(elapsed) / count);
+ }
+}
+
+// Multi-threaded test and latency report. Four threads share one filter and
+// run three phases in turn: concurrent adds, lookups of the added keys (all
+// must hit), and lookups of keys that were not added (false positives are
+// counted and reported, not asserted). One round of 8M keys runs by default;
+// FLAGS_enable_perf extends this to eight rounds of up to 64M keys.
+TEST_F(DynamicBloomTest, concurrent_with_perf) {
+ uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+ uint32_t m_limit = FLAGS_enable_perf ? 8 : 1;
+
+ uint32_t num_threads = 4;
+ std::vector<port::Thread> threads;
+
+ // NB: Uses sequential keys for speed, but that hides the FP rate
+ // impact of 32-bit hash, which is noticeable starting around 10M keys
+ // when they vary across hashing blocks.
+ for (uint32_t m = 1; m <= m_limit; ++m) {
+ Arena arena;
+ const uint32_t num_keys = m * 8 * 1024 * 1024;
+ fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
+
+ // 10 bits per key.
+ DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
+
+ // Sum of the per-thread elapsed times of the current phase.
+ std::atomic<uint64_t> elapsed(0);
+
+ // Phase 1: thread t adds keys 1 + t, 1 + t + num_threads, ...
+ std::function<void(size_t)> adder([&](size_t t) {
+ KeyMaker km;
+ StopWatchNano timer(Env::Default());
+ timer.Start();
+ for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+ std_bloom.AddConcurrently(km.Seq(i));
+ }
+ elapsed += timer.ElapsedNanos();
+ });
+ for (size_t t = 0; t < num_threads; ++t) {
+ threads.emplace_back(adder, t);
+ }
+ // Join every thread before starting the next phase.
+ while (threads.size() > 0) {
+ threads.back().join();
+ threads.pop_back();
+ }
+
+ fprintf(stderr,
+ "dynamic bloom, avg parallel add latency %3g"
+ " nanos/key\n",
+ static_cast<double>(elapsed) / num_threads / num_keys);
+
+ // Phase 2: every added key must be reported.
+ elapsed = 0;
+ std::function<void(size_t)> hitter([&](size_t t) {
+ KeyMaker km;
+ StopWatchNano timer(Env::Default());
+ timer.Start();
+ for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+ bool f = std_bloom.MayContain(km.Seq(i));
+ ASSERT_TRUE(f);
+ }
+ elapsed += timer.ElapsedNanos();
+ });
+ for (size_t t = 0; t < num_threads; ++t) {
+ threads.emplace_back(hitter, t);
+ }
+ while (threads.size() > 0) {
+ threads.back().join();
+ threads.pop_back();
+ }
+
+ fprintf(stderr,
+ "dynamic bloom, avg parallel hit latency %3g"
+ " nanos/key\n",
+ static_cast<double>(elapsed) / num_threads / num_keys);
+
+ // Phase 3: probe keys num_keys + 1 .. 2 * num_keys, none of which were
+ // added, and count how many the filter reports anyway.
+ elapsed = 0;
+ std::atomic<uint32_t> false_positives(0);
+ std::function<void(size_t)> misser([&](size_t t) {
+ KeyMaker km;
+ StopWatchNano timer(Env::Default());
+ timer.Start();
+ for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys; i += num_threads) {
+ bool f = std_bloom.MayContain(km.Seq(i));
+ if (f) {
+ ++false_positives;
+ }
+ }
+ elapsed += timer.ElapsedNanos();
+ });
+ for (size_t t = 0; t < num_threads; ++t) {
+ threads.emplace_back(misser, t);
+ }
+ while (threads.size() > 0) {
+ threads.back().join();
+ threads.pop_back();
+ }
+
+ fprintf(stderr,
+ "dynamic bloom, avg parallel miss latency %3g"
+ " nanos/key, %f%% false positive rate\n",
+ static_cast<double>(elapsed) / num_threads / num_keys,
+ false_positives.load() * 100.0 / num_keys);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test entry point: initializes googletest, parses the command-line flags
+// defined above, then runs every registered test and returns its result.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
+
+#endif // GFLAGS
diff --git a/src/rocksdb/util/file_checksum_helper.cc b/src/rocksdb/util/file_checksum_helper.cc
new file mode 100644
index 000000000..51dcf6a82
--- /dev/null
+++ b/src/rocksdb/util/file_checksum_helper.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Drops every recorded file checksum.
+void FileChecksumListImpl::reset() {
+  checksum_map_.clear();
+}
+
+// Number of files that currently have a recorded checksum.
+size_t FileChecksumListImpl::size() const {
+  const size_t entry_count = checksum_map_.size();
+  return entry_count;
+}
+
+// Appends every recorded entry to the three output vectors, one element per
+// file in each, so that equal indexes describe the same file. Entries are
+// visited in the map's iteration order and anything already in the vectors
+// is kept. Returns InvalidArgument if any output pointer is null, OK
+// otherwise.
+Status FileChecksumListImpl::GetAllFileChecksums(
+    std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+    std::vector<std::string>* checksum_func_names) {
+  if (file_numbers == nullptr || checksums == nullptr ||
+      checksum_func_names == nullptr) {
+    return Status::InvalidArgument("Pointer has not been initiated");
+  }
+
+  // Bind by const reference: iterating by value would copy the key and both
+  // strings of every entry before they are copied again into the vectors.
+  for (const auto& entry : checksum_map_) {
+    file_numbers->push_back(entry.first);
+    checksums->push_back(entry.second.first);
+    checksum_func_names->push_back(entry.second.second);
+  }
+  return Status::OK();
+}
+
+// Looks up `file_number` and, when found, copies its checksum and checksum
+// function name into the outputs. Returns InvalidArgument if either output
+// pointer is null, NotFound if the file has no entry, OK otherwise.
+Status FileChecksumListImpl::SearchOneFileChecksum(
+    uint64_t file_number, std::string* checksum,
+    std::string* checksum_func_name) {
+  const bool missing_output =
+      (checksum == nullptr) || (checksum_func_name == nullptr);
+  if (missing_output) {
+    return Status::InvalidArgument("Pointer has not been initiated");
+  }
+
+  const auto entry = checksum_map_.find(file_number);
+  if (entry == checksum_map_.end()) {
+    return Status::NotFound();
+  }
+
+  const auto& stored = entry->second;
+  *checksum = stored.first;
+  *checksum_func_name = stored.second;
+  return Status::OK();
+}
+
+// Records the checksum and checksum function name for `file_number`,
+// replacing any entry the file already has. Always returns OK.
+Status FileChecksumListImpl::InsertOneFileChecksum(
+    uint64_t file_number, const std::string& checksum,
+    const std::string& checksum_func_name) {
+  // operator[] creates the entry when it is missing, so one assignment
+  // covers both the insert and the overwrite case.
+  auto& stored = checksum_map_[file_number];
+  stored.first = checksum;
+  stored.second = checksum_func_name;
+  return Status::OK();
+}
+
+// Removes the entry for `file_number`. Returns NotFound if the file has no
+// entry, OK otherwise.
+Status FileChecksumListImpl::RemoveOneFileChecksum(uint64_t file_number) {
+  const size_t erased = checksum_map_.erase(file_number);
+  if (erased == 0) {
+    return Status::NotFound();
+  }
+  return Status::OK();
+}
+
+// Heap-allocates an empty FileChecksumListImpl and returns it through the
+// FileChecksumList interface; the object is allocated with `new` and not
+// released here.
+FileChecksumList* NewFileChecksumList() {
+  return new FileChecksumListImpl();
+}
+
+// Heap-allocates a FileChecksumFuncCrc32c and returns it through the
+// FileChecksumFunc interface; the object is allocated with `new` and not
+// released here.
+FileChecksumFunc* CreateFileChecksumFuncCrc32c() {
+  return new FileChecksumFuncCrc32c();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/file_checksum_helper.h b/src/rocksdb/util/file_checksum_helper.h
new file mode 100644
index 000000000..7ad9ea732
--- /dev/null
+++ b/src/rocksdb/util/file_checksum_helper.h
@@ -0,0 +1,117 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <cassert>
+#include <unordered_map>
+#include "port/port.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/status.h"
+#include "util/crc32c.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is the class to generate the file checksum based on CRC32C. It
+// will be used as the default checksum method for SST file checksum.
+// Checksums are exchanged as 4-byte strings that hold the 32-bit value most
+// significant byte first.
+class FileChecksumFuncCrc32c : public FileChecksumFunc {
+ public:
+  // Continues the checksum `init_checksum` (a 4-byte string in this class's
+  // format) over the `n` bytes at `data` and returns the result as a 4-byte
+  // string. `data` must not be null.
+  std::string Extend(const std::string& init_checksum, const char* data,
+                     size_t n) override {
+    assert(data != nullptr);
+    uint32_t checksum_value = StringToUint32(init_checksum);
+    return Uint32ToString(crc32c::Extend(checksum_value, data, n));
+  }
+
+  // Returns the checksum of the `n` bytes at `data` as a 4-byte string.
+  // `data` must not be null.
+  std::string Value(const char* data, size_t n) override {
+    assert(data != nullptr);
+    return Uint32ToString(crc32c::Value(data, n));
+  }
+
+  // Applies crc32c::Mask to `checksum` (a 4-byte string) and returns the
+  // masked value as a 4-byte string.
+  std::string ProcessChecksum(const std::string& checksum) override {
+    uint32_t checksum_value = StringToUint32(checksum);
+    return Uint32ToString(crc32c::Mask(checksum_value));
+  }
+
+  const char* Name() const override { return "FileChecksumCrc32c"; }
+
+  // Convert a uint32_t type data into a 4 bytes string, most significant
+  // byte first. Built from shifts, so the result does not depend on the
+  // host byte order.
+  static std::string Uint32ToString(uint32_t v) {
+    std::string s(sizeof(v), '\0');
+    s[0] = static_cast<char>((v >> 24) & 0xff);
+    s[1] = static_cast<char>((v >> 16) & 0xff);
+    s[2] = static_cast<char>((v >> 8) & 0xff);
+    s[3] = static_cast<char>(v & 0xff);
+    return s;
+  }
+
+  // Convert a 4 bytes size string into a uint32_t type data; inverse of
+  // Uint32ToString. `s` must be exactly 4 bytes long.
+  static uint32_t StringToUint32(std::string s) {
+    assert(s.size() == sizeof(uint32_t));
+    uint32_t v = 0;
+    for (size_t i = 0; i < sizeof(uint32_t); ++i) {
+      // Go through unsigned char: where plain char is signed, a byte >= 0x80
+      // would otherwise sign-extend and set all the higher bits of v.
+      v = (v << 8) | static_cast<unsigned char>(s[i]);
+    }
+    return v;
+  }
+};
+
+// The default implementation of FileChecksumList: an in-memory map from
+// file number to that file's checksum and checksum function name.
+class FileChecksumListImpl : public FileChecksumList {
+ public:
+ FileChecksumListImpl() {}
+ // Removes every entry.
+ void reset() override;
+
+ // Number of entries currently stored.
+ size_t size() const override;
+
+ // Appends all entries to the three output vectors, one element per file in
+ // each. Returns InvalidArgument if any pointer is null.
+ Status GetAllFileChecksums(
+ std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+ std::vector<std::string>* checksum_func_names) override;
+
+ // Copies the entry of `file_number` into the outputs. Returns
+ // InvalidArgument if either pointer is null, NotFound if there is no entry.
+ Status SearchOneFileChecksum(uint64_t file_number, std::string* checksum,
+ std::string* checksum_func_name) override;
+
+ // Adds the entry of `file_number`, or overwrites it if one exists.
+ Status InsertOneFileChecksum(uint64_t file_number,
+ const std::string& checksum,
+ const std::string& checksum_func_name) override;
+
+ // Erases the entry of `file_number`. Returns NotFound if there is none.
+ Status RemoveOneFileChecksum(uint64_t file_number) override;
+
+ private:
+ // Key is the file number, the first portion of the value is checksum, the
+ // second portion of the value is checksum function name.
+ std::unordered_map<uint64_t, std::pair<std::string, std::string>>
+ checksum_map_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/file_reader_writer_test.cc b/src/rocksdb/util/file_reader_writer_test.cc
new file mode 100644
index 000000000..f37bd5931
--- /dev/null
+++ b/src/rocksdb/util/file_reader_writer_test.cc
@@ -0,0 +1,444 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <algorithm>
+#include <vector>
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "file/readahead_raf.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Empty googletest fixture that groups the WritableFileWriter tests below.
+class WritableFileWriterTest : public testing::Test {};
+
+// One mebibyte, in bytes.
+const uint32_t kMb = 1 << 20;
+
+// Drives a WritableFileWriter configured with bytes_per_sync = 1MB through
+// many random-sized appends and checks, inside a fake file, how the writer
+// calls RangeSync: on 4096-byte boundaries, contiguously, and trailing the
+// written size by at least 1MB.
+TEST_F(WritableFileWriterTest, RangeSync) {
+ // Fake file that stores nothing; it only tracks how many bytes were
+ // appended and up to which offset RangeSync has been requested.
+ class FakeWF : public WritableFile {
+ public:
+ explicit FakeWF() : size_(0), last_synced_(0) {}
+ ~FakeWF() override {}
+
+ Status Append(const Slice& data) override {
+ size_ += data.size();
+ return Status::OK();
+ }
+ Status Truncate(uint64_t /*size*/) override { return Status::OK(); }
+ Status Close() override {
+ // At close, the unsynced tail must be at least 1MB and less than 2MB.
+ EXPECT_GE(size_, last_synced_ + kMb);
+ EXPECT_LT(size_, last_synced_ + 2 * kMb);
+ // Make sure random writes generated enough writes.
+ EXPECT_GT(size_, 10 * kMb);
+ return Status::OK();
+ }
+ Status Flush() override { return Status::OK(); }
+ Status Sync() override { return Status::OK(); }
+ Status Fsync() override { return Status::OK(); }
+ void SetIOPriority(Env::IOPriority /*pri*/) override {}
+ uint64_t GetFileSize() override { return size_; }
+ void GetPreallocationStatus(size_t* /*block_size*/,
+ size_t* /*last_allocated_block*/) override {}
+ size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override {
+ return 0;
+ }
+ Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override {
+ return Status::OK();
+ }
+
+ protected:
+ Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override {
+ return Status::OK();
+ }
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ // Both ends of the synced range must be multiples of 4096.
+ EXPECT_EQ(offset % 4096, 0u);
+ EXPECT_EQ(nbytes % 4096, 0u);
+
+ // Each range must start exactly where the previous one ended.
+ EXPECT_EQ(offset, last_synced_);
+ last_synced_ = offset + nbytes;
+ // At least the last 1MB written stays unsynced ...
+ EXPECT_GE(size_, last_synced_ + kMb);
+ // ... and, once more than 2MB were written, less than 2MB does.
+ if (size_ > 2 * kMb) {
+ EXPECT_LT(size_, last_synced_ + 2 * kMb);
+ }
+ return Status::OK();
+ }
+
+ // Total bytes appended so far.
+ uint64_t size_;
+ // End offset of the most recent RangeSync request.
+ uint64_t last_synced_;
+ };
+
+ EnvOptions env_options;
+ env_options.bytes_per_sync = kMb;
+ std::unique_ptr<FakeWF> wf(new FakeWF);
+ std::unique_ptr<WritableFileWriter> writer(
+ new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)),
+ "" /* don't care */, env_options));
+ Random r(301);
+ // Source buffer for the appends; its contents are never initialized or
+ // inspected, only its length matters to the fake file.
+ std::unique_ptr<char[]> large_buf(new char[10 * kMb]);
+ for (int i = 0; i < 1000; i++) {
+ // The last 300 iterations use a larger skew limit for the append size.
+ int skew_limit = (i < 700) ? 10 : 15;
+ uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100);
+ // NOTE(review): the Status returned by Append/Flush/Close is not checked.
+ writer->Append(Slice(large_buf.get(), num));
+
+ // Flush in a chance of 1/10.
+ if (r.Uniform(10) == 0) {
+ writer->Flush();
+ }
+ }
+ writer->Close();
+}
+
+// Writes random strings through a WritableFileWriter under several
+// configurations (two buffer size limits, direct I/O on or off, with or
+// without intermediate flushes) and checks that the bytes reaching the fake
+// file equal the bytes that were appended.
+TEST_F(WritableFileWriterTest, IncrementalBuffer) {
+ // Fake file that mirrors everything written into a caller-owned string.
+ class FakeWF : public WritableFile {
+ public:
+ explicit FakeWF(std::string* _file_data, bool _use_direct_io,
+ bool _no_flush)
+ : file_data_(_file_data),
+ use_direct_io_(_use_direct_io),
+ no_flush_(_no_flush) {}
+ ~FakeWF() override {}
+
+ Status Append(const Slice& data) override {
+ file_data_->append(data.data(), data.size());
+ size_ += data.size();
+ return Status::OK();
+ }
+ Status PositionedAppend(const Slice& data, uint64_t pos) override {
+ // Position and length are expected to be multiples of 512.
+ EXPECT_TRUE(pos % 512 == 0);
+ EXPECT_TRUE(data.size() % 512 == 0);
+ // Cut (or extend) the mirror to `pos`, then write from there.
+ file_data_->resize(pos);
+ file_data_->append(data.data(), data.size());
+ size_ += data.size();
+ return Status::OK();
+ }
+
+ Status Truncate(uint64_t size) override {
+ file_data_->resize(size);
+ return Status::OK();
+ }
+ Status Close() override { return Status::OK(); }
+ Status Flush() override { return Status::OK(); }
+ Status Sync() override { return Status::OK(); }
+ Status Fsync() override { return Status::OK(); }
+ void SetIOPriority(Env::IOPriority /*pri*/) override {}
+ uint64_t GetFileSize() override { return size_; }
+ void GetPreallocationStatus(size_t* /*block_size*/,
+ size_t* /*last_allocated_block*/) override {}
+ size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override {
+ return 0;
+ }
+ Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override {
+ return Status::OK();
+ }
+ bool use_direct_io() const override { return use_direct_io_; }
+
+ // Not owned; receives every byte written to this file.
+ std::string* file_data_;
+ bool use_direct_io_;
+ // Stored by the constructor; not read anywhere in this class.
+ bool no_flush_;
+ // Sum of the sizes passed to Append and PositionedAppend.
+ size_t size_ = 0;
+ };
+
+ Random r(301);
+ const int kNumAttempts = 50;
+ for (int attempt = 0; attempt < kNumAttempts; attempt++) {
+ // Every third attempt skips the random intermediate flushes.
+ bool no_flush = (attempt % 3 == 0);
+ EnvOptions env_options;
+ // First half of the attempts: 512KB buffer limit; second half: 700KB.
+ env_options.writable_file_max_buffer_size =
+ (attempt < kNumAttempts / 2) ? 512 * 1024 : 700 * 1024;
+ std::string actual;
+ // Direct I/O on odd attempts, except in ROCKSDB_LITE builds.
+ std::unique_ptr<FakeWF> wf(new FakeWF(&actual,
+#ifndef ROCKSDB_LITE
+ attempt % 2 == 1,
+#else
+ false,
+#endif
+ no_flush));
+ std::unique_ptr<WritableFileWriter> writer(
+ new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)),
+ "" /* don't care */, env_options));
+
+ // Everything appended, kept for the comparison at the end.
+ std::string target;
+ for (int i = 0; i < 20; i++) {
+ uint32_t num = r.Skewed(16) * 100 + r.Uniform(100);
+ std::string random_string;
+ test::RandomString(&r, num, &random_string);
+ writer->Append(Slice(random_string.c_str(), num));
+ target.append(random_string.c_str(), num);
+
+ // In some attempts, flush in a chance of 1/10.
+ if (!no_flush && r.Uniform(10) == 0) {
+ writer->Flush();
+ }
+ }
+ writer->Flush();
+ writer->Close();
+ ASSERT_EQ(target.size(), actual.size());
+ ASSERT_EQ(target, actual);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+// Checks that an error returned by the underlying file is surfaced by
+// WritableFileWriter::Append when the file reports direct I/O.
+TEST_F(WritableFileWriterTest, AppendStatusReturn) {
+ // Fake file whose append calls fail with an IOError once SetIOError(true)
+ // has been called, and succeed otherwise. No data is stored.
+ class FakeWF : public WritableFile {
+ public:
+ explicit FakeWF() : use_direct_io_(false), io_error_(false) {}
+
+ bool use_direct_io() const override { return use_direct_io_; }
+ Status Append(const Slice& /*data*/) override {
+ if (io_error_) {
+ return Status::IOError("Fake IO error");
+ }
+ return Status::OK();
+ }
+ Status PositionedAppend(const Slice& /*data*/, uint64_t) override {
+ if (io_error_) {
+ return Status::IOError("Fake IO error");
+ }
+ return Status::OK();
+ }
+ Status Close() override { return Status::OK(); }
+ Status Flush() override { return Status::OK(); }
+ Status Sync() override { return Status::OK(); }
+ void Setuse_direct_io(bool val) { use_direct_io_ = val; }
+ void SetIOError(bool val) { io_error_ = val; }
+
+ protected:
+ bool use_direct_io_;
+ bool io_error_;
+ };
+ std::unique_ptr<FakeWF> wf(new FakeWF());
+ wf->Setuse_direct_io(true);
+ std::unique_ptr<WritableFileWriter> writer(
+ new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)),
+ "" /* don't care */, EnvOptions()));
+
+ // With the error flag clear, a 2MB append succeeds.
+ ASSERT_OK(writer->Append(std::string(2 * kMb, 'a')));
+
+ // Next call to WritableFile::Append() should fail
+ // The writer owns the fake now, so it is reached back through the legacy
+ // wrapper in order to switch the error on.
+ LegacyWritableFileWrapper* file =
+ static_cast<LegacyWritableFileWrapper*>(writer->writable_file());
+ static_cast<FakeWF*>(file->target())->SetIOError(true);
+ ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b')));
+}
+#endif
+
+// Value-parameterized fixture for the file returned by
+// NewReadaheadRandomAccessFile. The parameter is the readahead size; each
+// test sets a source string with ResetSourceStr and reads it back through
+// the readahead wrapper with Read.
+class ReadaheadRandomAccessFileTest
+ : public testing::Test,
+ public testing::WithParamInterface<size_t> {
+ public:
+ // Readahead sizes to run with: 4KB and 64KB.
+ static std::vector<size_t> GetReadaheadSizeList() {
+ return {1lu << 12, 1lu << 16};
+ }
+ // Takes the readahead size from the test parameter, allocates a scratch
+ // buffer of twice that size and starts from an empty source.
+ void SetUp() override {
+ readahead_size_ = GetParam();
+ scratch_.reset(new char[2 * readahead_size_]);
+ ResetSourceStr();
+ }
+ ReadaheadRandomAccessFileTest() : control_contents_() {}
+ // Reads up to `n` bytes at `offset` through the readahead file and
+ // returns them as a string. The Status of the read is not checked.
+ std::string Read(uint64_t offset, size_t n) {
+ Slice result;
+ test_read_holder_->Read(offset, n, &result, scratch_.get());
+ return std::string(result.data(), result.size());
+ }
+ // Writes `str` through a WritableFileWriter over a test::StringSink bound
+ // to control_contents_, then rebuilds the readahead file over a
+ // test::StringSource of control_contents_.
+ void ResetSourceStr(const std::string& str = "") {
+ auto write_holder =
+ std::unique_ptr<WritableFileWriter>(test::GetWritableFileWriter(
+ new test::StringSink(&control_contents_), "" /* don't care */));
+ write_holder->Append(Slice(str));
+ write_holder->Flush();
+ auto read_holder = std::unique_ptr<RandomAccessFile>(
+ new test::StringSource(control_contents_));
+ test_read_holder_ =
+ NewReadaheadRandomAccessFile(std::move(read_holder), readahead_size_);
+ }
+ size_t GetReadaheadSize() const { return readahead_size_; }
+
+ private:
+ // Readahead size of the current parameterization, set in SetUp.
+ size_t readahead_size_;
+ // Handed to the StringSink on write and to the StringSource on read.
+ Slice control_contents_;
+ // The readahead file under test.
+ std::unique_ptr<RandomAccessFile> test_read_holder_;
+ // Buffer passed to every Read call; 2 * readahead_size_ bytes.
+ std::unique_ptr<char[]> scratch_;
+};
+
+// With the empty source installed by SetUp, every read is expected to come
+// back empty, whatever the offset and length.
+TEST_P(ReadaheadRandomAccessFileTest, EmptySourceStr) {
+ ASSERT_EQ("", Read(0, 1));
+ ASSERT_EQ("", Read(0, 0));
+ ASSERT_EQ("", Read(13, 13));
+}
+
+// Reads inside, across and beyond a source shorter than the readahead size.
+TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenLessThanReadaheadSize) {
+  const std::string source = "abcdefghijklmnopqrs";
+  ResetSourceStr(source);
+  ASSERT_EQ(source.substr(3, 4), Read(3, 4));
+  ASSERT_EQ(source.substr(0, 3), Read(0, 3));
+  ASSERT_EQ(source, Read(0, source.size()));
+  // Asking for more than remains yields only the tail of the source.
+  const int tail_len = std::min(static_cast<int>(source.size()) - 7, 30);
+  ASSERT_EQ(source.substr(7, tail_len), Read(7, 30));
+  ASSERT_EQ("", Read(100, 100));
+}
+
+// Random reads (each shorter than the readahead size) against sources of
+// growing length; every read must match the same range of the source.
+TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenGreaterThanReadaheadSize) {
+  Random rng(42);
+  for (int k = 0; k < 100; ++k) {
+    const size_t source_len =
+        k * GetReadaheadSize() +
+        rng.Uniform(static_cast<int>(GetReadaheadSize()));
+    const std::string source =
+        test::RandomHumanReadableString(&rng, static_cast<int>(source_len));
+    ResetSourceStr(source);
+    for (int round = 1; round <= 100; ++round) {
+      const size_t offset = rng.Uniform(static_cast<int>(source_len));
+      const size_t n = rng.Uniform(static_cast<int>(GetReadaheadSize()));
+      const size_t expected_len = std::min(n, source_len - offset);
+      ASSERT_EQ(source.substr(offset, expected_len), Read(offset, n));
+    }
+  }
+}
+
+// Random reads that are each at least one readahead size long.
+TEST_P(ReadaheadRandomAccessFileTest, ReadExceedsReadaheadSize) {
+  Random rng(7);
+  const size_t source_len =
+      4 * GetReadaheadSize() +
+      rng.Uniform(static_cast<int>(GetReadaheadSize()));
+  const std::string source =
+      test::RandomHumanReadableString(&rng, static_cast<int>(source_len));
+  ResetSourceStr(source);
+  for (int round = 1; round <= 100; ++round) {
+    const size_t offset = rng.Uniform(static_cast<int>(source_len));
+    const size_t n = GetReadaheadSize() +
+                     rng.Uniform(static_cast<int>(GetReadaheadSize()));
+    const size_t expected_len = std::min(n, source_len - offset);
+    ASSERT_EQ(source.substr(offset, expected_len), Read(offset, n));
+  }
+}
+
+// Instantiate the fixture once per listed name, each over the readahead
+// sizes returned by GetReadaheadSizeList().
+// NOTE(review): the first argument is only an instantiation prefix, so each
+// instantiation presumably runs every TEST_P of the fixture rather than just
+// the test with the matching name -- confirm against the googletest docs.
+INSTANTIATE_TEST_CASE_P(
+ EmptySourceStr, ReadaheadRandomAccessFileTest,
+ ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+ SourceStrLenLessThanReadaheadSize, ReadaheadRandomAccessFileTest,
+ ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+ SourceStrLenGreaterThanReadaheadSize, ReadaheadRandomAccessFileTest,
+ ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+ ReadExceedsReadaheadSize, ReadaheadRandomAccessFileTest,
+ ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+
+// Parameterized fixture for SequentialFileReader with readahead. The test
+// parameter is the readahead size in bytes; the source is an in-memory
+// string that is consumed front to back with Read() and Skip().
+class ReadaheadSequentialFileTest : public testing::Test,
+                                    public testing::WithParamInterface<size_t> {
+ public:
+  // Readahead sizes the fixture is instantiated with.
+  static std::vector<size_t> GetReadaheadSizeList() {
+    return {1lu << 8, 1lu << 12, 1lu << 16, 1lu << 18};
+  }
+  void SetUp() override {
+    readahead_size_ = GetParam();
+    // Sized for the largest read the tests issue (< 2 * readahead_size_).
+    scratch_.reset(new char[2 * readahead_size_]);
+    ResetSourceStr();
+  }
+  // The counter is handed by address to SeqStringSource, so give it a
+  // defined starting value; a default-constructed std::atomic<int> holds an
+  // indeterminate value before C++20.
+  ReadaheadSequentialFileTest() : seq_read_count_(0) {}
+  // Reads up to n bytes from the current position and returns them. The
+  // Status of the read is not checked; callers only compare the bytes.
+  std::string Read(size_t n) {
+    Slice result;
+    test_read_holder_->Read(n, &result, scratch_.get());
+    return std::string(result.data(), result.size());
+  }
+  // Advances the read position by n bytes; the returned Status is ignored.
+  void Skip(size_t n) { test_read_holder_->Skip(n); }
+  // Replaces the source with str and rebuilds the reader at position 0.
+  void ResetSourceStr(const std::string& str = "") {
+    auto read_holder = std::unique_ptr<SequentialFile>(
+        new test::SeqStringSource(str, &seq_read_count_));
+    test_read_holder_.reset(new SequentialFileReader(
+        NewLegacySequentialFileWrapper(read_holder), "test", readahead_size_));
+  }
+  size_t GetReadaheadSize() const { return readahead_size_; }
+
+ private:
+  size_t readahead_size_;
+  std::unique_ptr<SequentialFileReader> test_read_holder_;
+  std::unique_ptr<char[]> scratch_;
+  std::atomic<int> seq_read_count_;
+};
+
+// With an empty source, every read returns no data whatever the size.
+TEST_P(ReadaheadSequentialFileTest, EmptySourceStr) {
+  const std::string nothing;
+  ASSERT_EQ(nothing, Read(0));
+  ASSERT_EQ(nothing, Read(1));
+  ASSERT_EQ(nothing, Read(13));
+}
+
+// Consecutive reads walk through a short source and then hit end of data.
+TEST_P(ReadaheadSequentialFileTest, SourceStrLenLessThanReadaheadSize) {
+  const std::string source = "abcdefghijklmnopqrs";
+  ResetSourceStr(source);
+  ASSERT_EQ(source.substr(0, 3), Read(3));
+  ASSERT_EQ(source.substr(3, 1), Read(1));
+  // Over-long request: only the remaining tail comes back.
+  ASSERT_EQ(source.substr(4), Read(source.size()));
+  ASSERT_EQ("", Read(100));
+}
+
+// Sequential reads (each shorter than the readahead size) over sources of
+// growing length. The outer loop only runs with pass == 0, so the Skip()
+// branch is never taken as written.
+TEST_P(ReadaheadSequentialFileTest, SourceStrLenGreaterThanReadaheadSize) {
+  Random rng(42);
+  for (int pass = 0; pass < 1; ++pass) {
+    for (int k = 0; k < 100; ++k) {
+      const size_t source_len =
+          k * GetReadaheadSize() +
+          rng.Uniform(static_cast<int>(GetReadaheadSize()));
+      const std::string source =
+          test::RandomHumanReadableString(&rng, static_cast<int>(source_len));
+      ResetSourceStr(source);
+      size_t position = 0;
+      for (int round = 1; round <= 100; ++round) {
+        const size_t n = rng.Uniform(static_cast<int>(GetReadaheadSize()));
+        const bool skip_this_round = pass && round % 2;
+        if (skip_this_round) {
+          Skip(n);
+        } else {
+          const size_t expected_len = std::min(n, source_len - position);
+          ASSERT_EQ(source.substr(position, expected_len), Read(n));
+        }
+        position = std::min(position + n, source_len);
+      }
+    }
+  }
+}
+
+// Sequential reads that are each at least one readahead size long. The
+// outer loop only runs with pass == 0, so the Skip() branch is never taken
+// as written.
+TEST_P(ReadaheadSequentialFileTest, ReadExceedsReadaheadSize) {
+  Random rng(42);
+  for (int pass = 0; pass < 1; ++pass) {
+    for (int k = 0; k < 100; ++k) {
+      const size_t source_len =
+          k * GetReadaheadSize() +
+          rng.Uniform(static_cast<int>(GetReadaheadSize()));
+      const std::string source =
+          test::RandomHumanReadableString(&rng, static_cast<int>(source_len));
+      ResetSourceStr(source);
+      size_t position = 0;
+      for (int round = 1; round <= 100; ++round) {
+        const size_t n = GetReadaheadSize() +
+                         rng.Uniform(static_cast<int>(GetReadaheadSize()));
+        const bool skip_this_round = pass && round % 2;
+        if (skip_this_round) {
+          Skip(n);
+        } else {
+          const size_t expected_len = std::min(n, source_len - position);
+          ASSERT_EQ(source.substr(position, expected_len), Read(n));
+        }
+        position = std::min(position + n, source_len);
+      }
+    }
+  }
+}
+
+// Instantiate the fixture once per listed name, each over the readahead
+// sizes returned by GetReadaheadSizeList().
+// NOTE(review): the first argument is only an instantiation prefix, so each
+// instantiation presumably runs every TEST_P of the fixture rather than just
+// the test with the matching name -- confirm against the googletest docs.
+INSTANTIATE_TEST_CASE_P(
+ EmptySourceStr, ReadaheadSequentialFileTest,
+ ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+ SourceStrLenLessThanReadaheadSize, ReadaheadSequentialFileTest,
+ ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+ SourceStrLenGreaterThanReadaheadSize, ReadaheadSequentialFileTest,
+ ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+ ReadExceedsReadaheadSize, ReadaheadSequentialFileTest,
+ ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: hands the command line to googletest, then runs
+// every registered test and returns the overall result as the exit code.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/util/filelock_test.cc b/src/rocksdb/util/filelock_test.cc
new file mode 100644
index 000000000..20efd513f
--- /dev/null
+++ b/src/rocksdb/util/filelock_test.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/status.h"
+#include "rocksdb/env.h"
+
+#include <fcntl.h>
+#include <vector>
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Fixture for Env::LockFile()/UnlockFile(). It owns the path of the lock file
+// and can probe the file's lock state from a forked child process.
+class LockTest : public testing::Test {
+ public:
+  static LockTest* current_;
+  std::string file_;
+  ROCKSDB_NAMESPACE::Env* env_;
+
+  LockTest()
+      : file_(test::PerThreadDBPath("db_testlock_file")),
+        env_(ROCKSDB_NAMESPACE::Env::Default()) {
+    current_ = this;
+  }
+
+  ~LockTest() override {}
+
+  // Forwards to Env::LockFile() for file_.
+  Status LockFile(FileLock** db_lock) {
+    return env_->LockFile(file_, db_lock);
+  }
+
+  // Forwards to Env::UnlockFile().
+  Status UnlockFile(FileLock* db_lock) {
+    return env_->UnlockFile(db_lock);
+  }
+
+  // Returns true when a separate process fails to lock file_ (always true
+  // on OS_WIN, where the check is not implemented).
+  bool AssertFileIsLocked() {
+    return CheckFileLock(/* lock_expected = */ true);
+  }
+
+  // Returns true when a separate process succeeds in locking file_ (always
+  // true on OS_WIN, where the check is not implemented).
+  bool AssertFileIsNotLocked() {
+    return CheckFileLock(/* lock_expected = */ false);
+  }
+
+  // Forks a child that tries to take an exclusive fcntl lock on file_ and
+  // returns true when the outcome matches lock_expected. Returns false when
+  // fork() fails, when the child cannot open the file, or when the child
+  // does not exit normally.
+  bool CheckFileLock(bool lock_expected) {
+    // We need to fork to check the fcntl lock as we need
+    // to open and close the file from a different process
+    // to avoid either releasing the lock on close, or not
+    // contending for it when requesting a lock.
+
+#ifdef OS_WIN
+
+    // WaitForSingleObject and GetExitCodeProcess can do what waitpid does.
+    // TODO - implement on Windows
+    return true;
+
+#else
+
+    pid_t pid = fork();
+    if (pid < 0) {
+      fprintf(stderr, "Fork failed\n");
+      return false;
+    }
+
+    if (pid == 0) {
+      // Child process: report the result through the exit status.
+      int fd = open(file_.c_str(), O_RDWR | O_CREAT, 0644);
+      if (fd < 0) {
+        // could not open file, could not check if it was locked
+        fprintf(stderr, "Open on file %s failed.\n", file_.c_str());
+        exit(EXIT_FAILURE);
+      }
+
+      struct flock f;
+      memset(&f, 0, sizeof(f));
+      f.l_type = (F_WRLCK);
+      f.l_whence = SEEK_SET;
+      f.l_start = 0;
+      f.l_len = 0;  // Lock/unlock entire file
+      const bool lock_refused = (fcntl(fd, F_SETLK, &f) == -1);
+      // Success means the observed state matches what the parent expects.
+      const int exit_val =
+          (lock_refused == lock_expected) ? EXIT_SUCCESS : EXIT_FAILURE;
+      close(fd);  // lock is released for child process
+      exit(exit_val);
+    }
+
+    // Parent process: wait for the child and translate its exit status.
+    int status;
+    while (-1 == waitpid(pid, &status, 0));
+    return WIFEXITED(status) && WEXITSTATUS(status) == 0;
+
+#endif
+  }
+};
+LockTest* LockTest::current_;
+
+// A second LockFile() on an already locked file must fail with an IO error
+// and must leave the first lock in place until it is explicitly released.
+TEST_F(LockTest, LockBySameThread) {
+  FileLock* held_lock;
+  FileLock* second_lock;
+
+  // Take the lock and confirm another process sees the file as locked.
+  ASSERT_OK(LockFile(&held_lock));
+  ASSERT_TRUE(AssertFileIsLocked());
+
+  // Locking the same file again should fail...
+  ASSERT_TRUE(LockFile(&second_lock).IsIOError());
+
+  // ...without disturbing the lock that is already held.
+  ASSERT_TRUE(AssertFileIsLocked());
+
+  // Releasing the original lock makes the file lockable again.
+  ASSERT_OK(UnlockFile(held_lock));
+  ASSERT_TRUE(AssertFileIsNotLocked());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: hands the command line to googletest, then runs
+// every registered test and returns the overall result as the exit code.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/util/filter_bench.cc b/src/rocksdb/util/filter_bench.cc
new file mode 100644
index 000000000..b474eee48
--- /dev/null
+++ b/src/rocksdb/util/filter_bench.cc
@@ -0,0 +1,751 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+// Fallback entry point for builds without gflags or with ROCKSDB_LITE:
+// explains why the benchmark is unavailable and reports failure.
+int main() {
+  fputs("filter_bench requires gflags and !ROCKSDB_LITE\n", stderr);
+  return 1;
+}
+#else
+
+#include <cinttypes>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "memory/arena.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/mock_block_based_table.h"
+#include "table/plain/plain_table_bloom.h"
+#include "util/gflags_compat.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/stderr_logger.h"
+#include "util/stop_watch.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+// Command-line flags of the benchmark. Each definition creates a FLAGS_<name>
+// variable; the string is the text shown for the flag by -help.
+DEFINE_uint32(seed, 0, "Seed for random number generators");
+
+DEFINE_double(working_mem_size_mb, 200,
+              "MB of memory to get up to among all filters, unless "
+              "m_keys_total_max is specified.");
+
+DEFINE_uint32(average_keys_per_filter, 10000,
+              "Average number of keys per filter");
+
+DEFINE_double(vary_key_count_ratio, 0.4,
+              "Vary number of keys by up to +/- vary_key_count_ratio * "
+              "average_keys_per_filter.");
+
+DEFINE_uint32(key_size, 24, "Average number of bytes for each key");
+
+DEFINE_bool(vary_key_alignment, true,
+            "Whether to vary key alignment (default: at least 32-bit "
+            "alignment)");
+
+DEFINE_uint32(vary_key_size_log2_interval, 5,
+              "Use same key size 2^n times, then change. Key size varies from "
+              "-2 to +2 bytes vs. average, unless n>=30 to fix key size.");
+
+DEFINE_uint32(batch_size, 8, "Number of keys to group in each batch");
+
+DEFINE_double(bits_per_key, 10.0, "Bits per key setting for filters");
+
+DEFINE_double(m_queries, 200, "Millions of queries for each test mode");
+
+DEFINE_double(m_keys_total_max, 0,
+              "Maximum total keys added to filters, in millions. "
+              "0 (default) disables. Non-zero overrides working_mem_size_mb "
+              "option.");
+
+DEFINE_bool(use_full_block_reader, false,
+            "Use FullFilterBlockReader interface rather than FilterBitsReader");
+
+DEFINE_bool(use_plain_table_bloom, false,
+            "Use PlainTableBloom structure and interface rather than "
+            "FilterBitsReader/FullFilterBlockReader");
+
+DEFINE_bool(new_builder, false,
+            "Whether to create a new builder for each new filter");
+
+// The adjacent literals are concatenated, so each fragment must carry its own
+// separating space.
+DEFINE_uint32(impl, 0,
+              "Select filter implementation. Without -use_plain_table_bloom: "
+              "0 = full filter, 1 = block-based filter. With "
+              "-use_plain_table_bloom: 0 = no locality, 1 = locality.");
+
+DEFINE_bool(net_includes_hashing, false,
+            "Whether query net ns/op times should include hashing. "
+            "(if not, dry run will include hashing) "
+            "(build times always include hashing)");
+
+DEFINE_bool(quick, false, "Run more limited set of tests, fewer queries");
+
+DEFINE_bool(best_case, false, "Run limited tests only for best-case");
+
+DEFINE_bool(allow_bad_fp_rate, false, "Continue even if FP rate is bad");
+
+DEFINE_bool(legend, false,
+            "Print more information about interpreting results instead of "
+            "running tests");
+
+DEFINE_uint32(runs, 1, "Number of times to rebuild and run benchmark tests");
+
+// Reports a failed ALWAYS_ASSERT on stderr and aborts the process.
+//   line, file: source location of the failing check
+//   expr: text of the condition that evaluated to false
+void _always_assert_fail(int line, const char *file, const char *expr) {
+  fprintf(stderr, "%s: %d: Assertion %s failed\n", file, line, expr);
+  abort();
+}
+
+// Assertion that does not depend on NDEBUG: when cond is false, the failure
+// is reported through _always_assert_fail().
+#define ALWAYS_ASSERT(cond) \
+  (!(cond) ? ::_always_assert_fail(__LINE__, __FILE__, #cond) : (void)0)
+
+#ifndef NDEBUG
+// This could affect build times enough that we should not include it for
+// accurate speed tests
+#define PREDICT_FP_RATE
+#endif
+
+using ROCKSDB_NAMESPACE::Arena;
+using ROCKSDB_NAMESPACE::BlockContents;
+using ROCKSDB_NAMESPACE::BloomFilterPolicy;
+using ROCKSDB_NAMESPACE::BloomHash;
+using ROCKSDB_NAMESPACE::BuiltinFilterBitsBuilder;
+using ROCKSDB_NAMESPACE::CachableEntry;
+using ROCKSDB_NAMESPACE::EncodeFixed32;
+using ROCKSDB_NAMESPACE::fastrange32;
+using ROCKSDB_NAMESPACE::FilterBitsReader;
+using ROCKSDB_NAMESPACE::FilterBuildingContext;
+using ROCKSDB_NAMESPACE::FullFilterBlockReader;
+using ROCKSDB_NAMESPACE::GetSliceHash;
+using ROCKSDB_NAMESPACE::GetSliceHash64;
+using ROCKSDB_NAMESPACE::Lower32of64;
+using ROCKSDB_NAMESPACE::ParsedFullFilterBlock;
+using ROCKSDB_NAMESPACE::PlainTableBloomV1;
+using ROCKSDB_NAMESPACE::Random32;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::StderrLogger;
+using ROCKSDB_NAMESPACE::mock::MockBlockBasedTableTester;
+
+// Produces benchmark keys in a reusable internal buffer. Key length and
+// alignment can vary from call to call, depending on the
+// -vary_key_size_log2_interval and -vary_key_alignment flags.
+struct KeyMaker {
+  // avg_size: target average key length in bytes.
+  //
+  // When key sizes vary (interval < 30), Get() adds 0..4 bytes to
+  // smallest_size_, so the smallest key must sit 2 bytes below the average
+  // to give the range [avg_size - 2, avg_size + 2]. With a fixed key size
+  // (interval >= 30) every key is exactly avg_size bytes. The original
+  // condition had the two cases swapped.
+  KeyMaker(size_t avg_size)
+      : smallest_size_(avg_size -
+                       (FLAGS_vary_key_size_log2_interval < 30 ? 2 : 0)),
+        buf_size_(avg_size + 11),  // pad to vary key size and alignment
+        buf_(new char[buf_size_]) {
+    memset(buf_.get(), 0, buf_size_);
+    assert(smallest_size_ > 8);
+  }
+  size_t smallest_size_;
+  size_t buf_size_;
+  std::unique_ptr<char[]> buf_;
+
+  // Returns a unique(-ish) key based on the given parameter values. Each
+  // call returns a Slice from the same buffer so previously returned
+  // Slices should be considered invalidated.
+  Slice Get(uint32_t filter_num, uint32_t val_num) {
+    // Start offset 0..3 within the buffer varies the key's alignment.
+    size_t start = FLAGS_vary_key_alignment ? val_num % 4 : 0;
+    size_t len = smallest_size_;
+    if (FLAGS_vary_key_size_log2_interval < 30) {
+      // To get range [avg_size - 2, avg_size + 2]
+      // use range [smallest_size, smallest_size + 4]
+      len += fastrange32(
+          (val_num >> FLAGS_vary_key_size_log2_interval) * 1234567891, 5);
+    }
+    char *data = buf_.get() + start;
+    // Populate key data such that all data makes it into a key of at
+    // least 8 bytes. We also don't want all the within-filter key
+    // variance confined to a contiguous 32 bits, because then a 32 bit
+    // hash function can "cheat" the false positive rate by
+    // approximating a perfect hash.
+    EncodeFixed32(data, val_num);
+    EncodeFixed32(data + 4, filter_num + val_num);
+    // ensure clearing leftovers from different alignment
+    EncodeFixed32(data + 8, 0);
+    return Slice(data, len);
+  }
+};
+
+// Prints a warning on stdout for each build setting that makes the benchmark
+// numbers unrepresentative (no optimization, assertions enabled).
+void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+  fputs("WARNING: Optimization is disabled: benchmarks unnecessarily slow\n",
+        stdout);
+#endif
+#ifndef NDEBUG
+  fputs("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n",
+        stdout);
+#endif
+}
+
+// Everything the benchmark keeps for one built filter.
+struct FilterInfo {
+ // Random id of the filter; passed as filter_num to KeyMaker::Get().
+ uint32_t filter_id_ = 0;
+ // Receives the buffer handed back by the builder's Finish() call.
+ std::unique_ptr<const char[]> owner_;
+ // The filter data itself (from Finish() or PlainTableBloomV1::GetRawData()).
+ Slice filter_;
+ // Number of keys that were added to this filter.
+ uint32_t keys_added_ = 0;
+ // Query interfaces; reader_ and full_block_reader_ are set on the
+ // non-plain-table path, plain_table_bloom_ with -use_plain_table_bloom.
+ std::unique_ptr<FilterBitsReader> reader_;
+ std::unique_ptr<FullFilterBlockReader> full_block_reader_;
+ std::unique_ptr<PlainTableBloomV1> plain_table_bloom_;
+ // Per-test counters, reset at the start of each RandomQueryTest() call.
+ uint64_t outside_queries_ = 0;
+ uint64_t false_positives_ = 0;
+};
+
+// Query patterns exercised by FilterBench::RandomQueryTest().
+enum TestMode {
+ // All queries go to a single filter.
+ kSingleFilter,
+ // Batches of keys against one filter through the multi-key MayMatch().
+ kBatchPrepared,
+ // Batches of keys against one filter, one single-key query per key.
+ kBatchUnprepared,
+ // About 50% of queries go to 1% of the filters.
+ kFiftyOneFilter,
+ // About 80% of queries go to 20% of the filters.
+ kEightyTwentyFilter,
+ // Each query goes to a filter chosen at random among all filters.
+ kRandomFilter,
+};
+
+// Modes run by default.
+static const std::vector<TestMode> allTestModes = {
+ kSingleFilter, kBatchPrepared, kBatchUnprepared,
+ kFiftyOneFilter, kEightyTwentyFilter, kRandomFilter,
+};
+
+// Modes run with -quick.
+static const std::vector<TestMode> quickTestModes = {
+ kSingleFilter,
+ kRandomFilter,
+};
+
+// Modes run with -best_case.
+static const std::vector<TestMode> bestCaseTestModes = {
+ kSingleFilter,
+};
+
+// Returns the label printed for a test mode, or "Bad TestMode" when tm is
+// not one of the enumerators.
+const char *TestModeToString(TestMode tm) {
+  const char *label = "Bad TestMode";
+  switch (tm) {
+    case kSingleFilter:
+      label = "Single filter";
+      break;
+    case kBatchPrepared:
+      label = "Batched, prepared";
+      break;
+    case kBatchUnprepared:
+      label = "Batched, unprepared";
+      break;
+    case kFiftyOneFilter:
+      label = "Skewed 50% in 1%";
+      break;
+    case kEightyTwentyFilter:
+      label = "Skewed 80% in 20%";
+      break;
+    case kRandomFilter:
+      label = "Random filter";
+      break;
+  }
+  return label;
+}
+
+// Do just enough to keep some data dependence for the
+// compiler / CPU
+// Dry-run stand-in that does no hashing: returns the key size, plus the
+// key's fourth byte when the key has at least four bytes.
+static uint32_t DryRunNoHash(Slice &s) {
+  const uint32_t sz = static_cast<uint32_t>(s.size());
+  return sz >= 4 ? sz + s.data()[3] : sz;
+}
+
+// Dry-run stand-in for the 32-bit hashing cost.
+static uint32_t DryRunHash32(Slice &s) {
+  // Same perf characteristics as GetSliceHash()
+  const uint32_t hash = BloomHash(s);
+  return hash;
+}
+
+// Dry-run stand-in for the 64-bit hashing cost; only the low 32 bits of the
+// hash are returned.
+static uint32_t DryRunHash64(Slice &s) {
+  const auto hash64 = GetSliceHash64(s);
+  return Lower32of64(hash64);
+}
+
+// Benchmark driver: builds a set of filters in Go() and times queries
+// against them in RandomQueryTest().
+struct FilterBench : public MockBlockBasedTableTester {
+  std::vector<KeyMaker> kms_;      // one key maker per batch slot
+  std::vector<FilterInfo> infos_;  // every filter built by Go()
+  Random32 random_;
+  std::ostringstream fp_rate_report_;
+  Arena arena_;
+  StderrLogger stderr_logger_;
+  double m_queries_;  // millions of queries per test, set in Go()
+
+  // Configures the filter policy from -bits_per_key and -impl and creates
+  // -batch_size key makers for keys of at least 8 bytes.
+  FilterBench()
+      : MockBlockBasedTableTester(new BloomFilterPolicy(
+            FLAGS_bits_per_key,
+            static_cast<BloomFilterPolicy::Mode>(FLAGS_impl))),
+        random_(FLAGS_seed),
+        m_queries_(0) {
+    const size_t key_size = FLAGS_key_size < 8 ? 8 : FLAGS_key_size;
+    while (kms_.size() < FLAGS_batch_size) {
+      kms_.emplace_back(key_size);
+    }
+    ioptions_.info_log = &stderr_logger_;
+  }
+
+  // Builds the filters and runs the whole benchmark once.
+  void Go();
+
+  // Times one query mix; returns the average ns per query.
+  double RandomQueryTest(uint32_t inside_threshold, bool dry_run,
+                         TestMode mode);
+};
+
+// Runs the benchmark once: validates the flag combination (throwing
+// std::runtime_error on a bad one), builds filters until the memory or key
+// budget is reached, prints build statistics, optionally verifies that there
+// are no false negatives and that the FP rate is tolerable, and then times
+// the query mixes through RandomQueryTest().
+void FilterBench::Go() {
+ if (FLAGS_use_plain_table_bloom && FLAGS_use_full_block_reader) {
+ throw std::runtime_error(
+ "Can't combine -use_plain_table_bloom and -use_full_block_reader");
+ }
+ if (FLAGS_use_plain_table_bloom) {
+ if (FLAGS_impl > 1) {
+ throw std::runtime_error(
+ "-impl must currently be >= 0 and <= 1 for Plain table");
+ }
+ } else {
+ if (FLAGS_impl == 1) {
+ throw std::runtime_error(
+ "Block-based filter not currently supported by filter_bench");
+ }
+ if (FLAGS_impl > 2) {
+ throw std::runtime_error(
+ "-impl must currently be 0 or 2 for Block-based table");
+ }
+ }
+
+ if (FLAGS_vary_key_count_ratio < 0.0 || FLAGS_vary_key_count_ratio > 1.0) {
+ throw std::runtime_error("-vary_key_count_ratio must be >= 0.0 and <= 1.0");
+ }
+
+ // For example, average_keys_per_filter = 100, vary_key_count_ratio = 0.1.
+ // Varies up to +/- 10 keys. variance_range = 21 (generating value 0..20).
+ // variance_offset = 10, so value - offset average value is always 0.
+ const uint32_t variance_range =
+ 1 + 2 * static_cast<uint32_t>(FLAGS_vary_key_count_ratio *
+ FLAGS_average_keys_per_filter);
+ const uint32_t variance_offset = variance_range / 2;
+
+ const std::vector<TestMode> &testModes =
+ FLAGS_best_case ? bestCaseTestModes
+ : FLAGS_quick ? quickTestModes : allTestModes;
+
+ // -quick and -best_case shrink the amount of work.
+ m_queries_ = FLAGS_m_queries;
+ double working_mem_size_mb = FLAGS_working_mem_size_mb;
+ if (FLAGS_quick) {
+ m_queries_ /= 7.0;
+ } else if (FLAGS_best_case) {
+ m_queries_ /= 3.0;
+ working_mem_size_mb /= 10.0;
+ }
+
+ std::cout << "Building..." << std::endl;
+
+ std::unique_ptr<BuiltinFilterBitsBuilder> builder;
+
+ size_t total_memory_used = 0;
+ size_t total_keys_added = 0;
+#ifdef PREDICT_FP_RATE
+ double weighted_predicted_fp_rate = 0.0;
+#endif
+ // Exactly one of the two budgets is active; the other is set to SIZE_MAX.
+ size_t max_total_keys;
+ size_t max_mem;
+ if (FLAGS_m_keys_total_max > 0) {
+ max_total_keys = static_cast<size_t>(1000000 * FLAGS_m_keys_total_max);
+ max_mem = SIZE_MAX;
+ } else {
+ max_total_keys = SIZE_MAX;
+ max_mem = static_cast<size_t>(1024 * 1024 * working_mem_size_mb);
+ }
+
+ ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
+ true);
+
+ infos_.clear();
+ while ((working_mem_size_mb == 0 || total_memory_used < max_mem) &&
+ total_keys_added < max_total_keys) {
+ uint32_t filter_id = random_.Next();
+ uint32_t keys_to_add = FLAGS_average_keys_per_filter +
+ fastrange32(random_.Next(), variance_range) -
+ variance_offset;
+ // Do not overshoot the total key budget with the last filter.
+ if (max_total_keys - total_keys_added < keys_to_add) {
+ keys_to_add = static_cast<uint32_t>(max_total_keys - total_keys_added);
+ }
+ infos_.emplace_back();
+ FilterInfo &info = infos_.back();
+ info.filter_id_ = filter_id;
+ info.keys_added_ = keys_to_add;
+ if (FLAGS_use_plain_table_bloom) {
+ info.plain_table_bloom_.reset(new PlainTableBloomV1());
+ info.plain_table_bloom_->SetTotalBits(
+ &arena_, static_cast<uint32_t>(keys_to_add * FLAGS_bits_per_key),
+ FLAGS_impl, 0 /*huge_page*/, nullptr /*logger*/);
+ for (uint32_t i = 0; i < keys_to_add; ++i) {
+ uint32_t hash = GetSliceHash(kms_[0].Get(filter_id, i));
+ info.plain_table_bloom_->AddHash(hash);
+ }
+ info.filter_ = info.plain_table_bloom_->GetRawData();
+ } else {
+ // The builder is reused across filters unless -new_builder is set.
+ if (!builder) {
+ builder.reset(&dynamic_cast<BuiltinFilterBitsBuilder &>(*GetBuilder()));
+ }
+ for (uint32_t i = 0; i < keys_to_add; ++i) {
+ builder->AddKey(kms_[0].Get(filter_id, i));
+ }
+ info.filter_ = builder->Finish(&info.owner_);
+#ifdef PREDICT_FP_RATE
+ weighted_predicted_fp_rate +=
+ keys_to_add *
+ builder->EstimatedFpRate(keys_to_add, info.filter_.size());
+#endif
+ if (FLAGS_new_builder) {
+ builder.reset();
+ }
+ info.reader_.reset(
+ table_options_.filter_policy->GetFilterBitsReader(info.filter_));
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(info.filter_)),
+ nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+ info.full_block_reader_.reset(
+ new FullFilterBlockReader(table_.get(), std::move(block)));
+ }
+ total_memory_used += info.filter_.size();
+ total_keys_added += keys_to_add;
+ }
+
+ uint64_t elapsed_nanos = timer.ElapsedNanos();
+ double ns = double(elapsed_nanos) / total_keys_added;
+ std::cout << "Build avg ns/key: " << ns << std::endl;
+ std::cout << "Number of filters: " << infos_.size() << std::endl;
+ std::cout << "Total memory (MB): " << total_memory_used / 1024.0 / 1024.0
+ << std::endl;
+
+ double bpk = total_memory_used * 8.0 / total_keys_added;
+ std::cout << "Bits/key actual: " << bpk << std::endl;
+#ifdef PREDICT_FP_RATE
+ std::cout << "Predicted FP rate %: "
+ << 100.0 * (weighted_predicted_fp_rate / total_keys_added)
+ << std::endl;
+#endif
+ if (!FLAGS_quick && !FLAGS_best_case) {
+ double tolerable_rate = std::pow(2.0, -(bpk - 1.0) / (1.4 + bpk / 50.0));
+ std::cout << "Best possible FP rate %: " << 100.0 * std::pow(2.0, -bpk)
+ << std::endl;
+ std::cout << "Tolerable FP rate %: " << 100.0 * tolerable_rate << std::endl;
+
+ std::cout << "----------------------------" << std::endl;
+ std::cout << "Verifying..." << std::endl;
+
+ uint32_t outside_q_per_f =
+ static_cast<uint32_t>(m_queries_ * 1000000 / infos_.size());
+ uint64_t fps = 0;
+ for (uint32_t i = 0; i < infos_.size(); ++i) {
+ FilterInfo &info = infos_[i];
+ // Every key that was added must be reported as a possible match.
+ for (uint32_t j = 0; j < info.keys_added_; ++j) {
+ if (FLAGS_use_plain_table_bloom) {
+ uint32_t hash = GetSliceHash(kms_[0].Get(info.filter_id_, j));
+ ALWAYS_ASSERT(info.plain_table_bloom_->MayContainHash(hash));
+ } else {
+ ALWAYS_ASSERT(
+ info.reader_->MayMatch(kms_[0].Get(info.filter_id_, j)));
+ }
+ }
+ // Outside keys set the top bit of val_num; presumably this keeps them
+ // disjoint from the added keys, whose val_num is below keys_added_.
+ for (uint32_t j = 0; j < outside_q_per_f; ++j) {
+ if (FLAGS_use_plain_table_bloom) {
+ uint32_t hash =
+ GetSliceHash(kms_[0].Get(info.filter_id_, j | 0x80000000));
+ fps += info.plain_table_bloom_->MayContainHash(hash);
+ } else {
+ fps += info.reader_->MayMatch(
+ kms_[0].Get(info.filter_id_, j | 0x80000000));
+ }
+ }
+ }
+ std::cout << " No FNs :)" << std::endl;
+ double prelim_rate = double(fps) / outside_q_per_f / infos_.size();
+ std::cout << " Prelim FP rate %: " << (100.0 * prelim_rate) << std::endl;
+
+ if (!FLAGS_allow_bad_fp_rate) {
+ ALWAYS_ASSERT(prelim_rate < tolerable_rate);
+ }
+ }
+
+ // Each mode is timed twice from the same seed: a real run and a dry run.
+ // The difference is reported as the net cost of the filter query.
+ std::cout << "----------------------------" << std::endl;
+ std::cout << "Mixed inside/outside queries..." << std::endl;
+ // 50% each inside and outside
+ uint32_t inside_threshold = UINT32_MAX / 2;
+ for (TestMode tm : testModes) {
+ random_.Seed(FLAGS_seed + 1);
+ double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+ random_.Seed(FLAGS_seed + 1);
+ double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
+ std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
+ << std::endl;
+ }
+
+ if (!FLAGS_quick) {
+ std::cout << "----------------------------" << std::endl;
+ std::cout << "Inside queries (mostly)..." << std::endl;
+ // Do about 95% inside queries rather than 100% so that branch predictor
+ // can't give itself an artificially crazy advantage.
+ inside_threshold = UINT32_MAX / 20 * 19;
+ for (TestMode tm : testModes) {
+ random_.Seed(FLAGS_seed + 1);
+ double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+ random_.Seed(FLAGS_seed + 1);
+ double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
+ std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
+ << std::endl;
+ }
+
+ std::cout << "----------------------------" << std::endl;
+ std::cout << "Outside queries (mostly)..." << std::endl;
+ // Do about 95% outside queries rather than 100% so that branch predictor
+ // can't give itself an artificially crazy advantage.
+ inside_threshold = UINT32_MAX / 20;
+ for (TestMode tm : testModes) {
+ random_.Seed(FLAGS_seed + 2);
+ double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+ random_.Seed(FLAGS_seed + 2);
+ double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
+ std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
+ << std::endl;
+ }
+ }
+ // Report left behind by the last non-dry RandomQueryTest() call.
+ std::cout << fp_rate_report_.str();
+
+ std::cout << "----------------------------" << std::endl;
+ std::cout << "Done. (For more info, run with -legend or -help.)" << std::endl;
+}
+
+// Times m_queries_ million queries against the filters in infos_ and returns
+// the average wall-clock nanoseconds per query.
+//
+//   inside_threshold: a query uses a key that was added to the filter when
+//       random_.Next() <= inside_threshold; otherwise it uses an outside key.
+//   dry_run: when true, the filter is not queried; only the selected dry-run
+//       hash function runs, so the result measures the test overhead.
+//   mode: how filters are chosen and whether keys are batched.
+//
+// A non-dry run also rewrites fp_rate_report_ with the observed FP rates.
+double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
+                                    TestMode mode) {
+  for (auto &info : infos_) {
+    info.outside_queries_ = 0;
+    info.false_positives_ = 0;
+  }
+
+  // Unless the net time should include hashing, make the dry run pay for
+  // the hash that the selected filter implementation would compute.
+  auto dry_run_hash_fn = DryRunNoHash;
+  if (!FLAGS_net_includes_hashing) {
+    if (FLAGS_impl < 2 || FLAGS_use_plain_table_bloom) {
+      dry_run_hash_fn = DryRunHash32;
+    } else {
+      dry_run_hash_fn = DryRunHash64;
+    }
+  }
+
+  uint32_t num_infos = static_cast<uint32_t>(infos_.size());
+  uint32_t dry_run_hash = 0;
+  uint64_t max_queries = static_cast<uint64_t>(m_queries_ * 1000000 + 0.50);
+  // Some filters may be considered secondary in order to implement skewed
+  // queries. num_primary_filters is the number that are to be treated as
+  // equal, and any remainder will be treated as secondary.
+  uint32_t num_primary_filters = num_infos;
+  // The proportion (when divided by 2^32 - 1) of filter queries going to
+  // the primary filters (default = all). The remainder of queries are
+  // against secondary filters.
+  uint32_t primary_filter_threshold = 0xffffffff;
+  if (mode == kSingleFilter) {
+    // 100% of queries to 1 filter
+    num_primary_filters = 1;
+  } else if (mode == kFiftyOneFilter) {
+    // 50% of queries
+    primary_filter_threshold /= 2;
+    // to 1% of filters
+    num_primary_filters = (num_primary_filters + 99) / 100;
+  } else if (mode == kEightyTwentyFilter) {
+    // 80% of queries
+    primary_filter_threshold = primary_filter_threshold / 5 * 4;
+    // to 20% of filters
+    num_primary_filters = (num_primary_filters + 4) / 5;
+  }
+  if (num_primary_filters >= num_infos) {
+    // No secondary filters exist (for example when only one filter was
+    // built). The secondary branch below would then compute an index of at
+    // least num_infos and read past the end of infos_, so send every query
+    // to the primary set instead.
+    num_primary_filters = num_infos;
+    primary_filter_threshold = 0xffffffff;
+  }
+  uint32_t batch_size = 1;
+  std::unique_ptr<Slice[]> batch_slices;
+  std::unique_ptr<Slice *[]> batch_slice_ptrs;
+  std::unique_ptr<bool[]> batch_results;
+  if (mode == kBatchPrepared || mode == kBatchUnprepared) {
+    batch_size = static_cast<uint32_t>(kms_.size());
+  }
+
+  batch_slices.reset(new Slice[batch_size]);
+  batch_slice_ptrs.reset(new Slice *[batch_size]);
+  batch_results.reset(new bool[batch_size]);
+  for (uint32_t i = 0; i < batch_size; ++i) {
+    batch_results[i] = false;
+    batch_slice_ptrs[i] = &batch_slices[i];
+  }
+
+  ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
+                                         true);
+
+  for (uint64_t q = 0; q < max_queries; q += batch_size) {
+    bool inside_this_time = random_.Next() <= inside_threshold;
+
+    uint32_t filter_index;
+    if (random_.Next() <= primary_filter_threshold) {
+      filter_index = random_.Uniformish(num_primary_filters);
+    } else {
+      // secondary
+      filter_index = num_primary_filters +
+                     random_.Uniformish(num_infos - num_primary_filters);
+    }
+    FilterInfo &info = infos_[filter_index];
+    // Each batch slot uses its own KeyMaker so that all slices of the batch
+    // stay valid at the same time.
+    for (uint32_t i = 0; i < batch_size; ++i) {
+      if (inside_this_time) {
+        batch_slices[i] =
+            kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_));
+      } else {
+        batch_slices[i] =
+            kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_) |
+                                             uint32_t{0x80000000});
+        info.outside_queries_++;
+      }
+    }
+    // TODO: implement batched interface to full block reader
+    // TODO: implement batched interface to plain table bloom
+    if (mode == kBatchPrepared && !FLAGS_use_full_block_reader &&
+        !FLAGS_use_plain_table_bloom) {
+      for (uint32_t i = 0; i < batch_size; ++i) {
+        batch_results[i] = false;
+      }
+      if (dry_run) {
+        for (uint32_t i = 0; i < batch_size; ++i) {
+          batch_results[i] = true;
+          dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+        }
+      } else {
+        info.reader_->MayMatch(batch_size, batch_slice_ptrs.get(),
+                               batch_results.get());
+      }
+      for (uint32_t i = 0; i < batch_size; ++i) {
+        if (inside_this_time) {
+          ALWAYS_ASSERT(batch_results[i]);
+        } else {
+          info.false_positives_ += batch_results[i];
+        }
+      }
+    } else {
+      for (uint32_t i = 0; i < batch_size; ++i) {
+        bool may_match;
+        if (FLAGS_use_plain_table_bloom) {
+          if (dry_run) {
+            dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+            may_match = true;
+          } else {
+            uint32_t hash = GetSliceHash(batch_slices[i]);
+            may_match = info.plain_table_bloom_->MayContainHash(hash);
+          }
+        } else if (FLAGS_use_full_block_reader) {
+          if (dry_run) {
+            dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+            may_match = true;
+          } else {
+            may_match = info.full_block_reader_->KeyMayMatch(
+                batch_slices[i],
+                /*prefix_extractor=*/nullptr,
+                /*block_offset=*/ROCKSDB_NAMESPACE::kNotValid,
+                /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                /*get_context=*/nullptr,
+                /*lookup_context=*/nullptr);
+          }
+        } else {
+          if (dry_run) {
+            dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+            may_match = true;
+          } else {
+            may_match = info.reader_->MayMatch(batch_slices[i]);
+          }
+        }
+        if (inside_this_time) {
+          ALWAYS_ASSERT(may_match);
+        } else {
+          info.false_positives_ += may_match;
+        }
+      }
+    }
+  }
+
+  uint64_t elapsed_nanos = timer.ElapsedNanos();
+  double ns = double(elapsed_nanos) / max_queries;
+
+  if (!FLAGS_quick) {
+    if (dry_run) {
+      // Printing part of hash prevents dry run components from being optimized
+      // away by compiler
+      std::cout << " Dry run (" << std::hex << (dry_run_hash & 0xfffff)
+                << std::dec << ") ";
+    } else {
+      std::cout << " Gross filter ";
+    }
+    std::cout << "ns/op: " << ns << std::endl;
+  }
+
+  if (!dry_run) {
+    // Replace the previous report with the FP statistics of this run.
+    fp_rate_report_.str("");
+    uint64_t q = 0;
+    uint64_t fp = 0;
+    double worst_fp_rate = 0.0;
+    double best_fp_rate = 1.0;
+    for (auto &info : infos_) {
+      q += info.outside_queries_;
+      fp += info.false_positives_;
+      if (info.outside_queries_ > 0) {
+        double fp_rate = double(info.false_positives_) / info.outside_queries_;
+        worst_fp_rate = std::max(worst_fp_rate, fp_rate);
+        best_fp_rate = std::min(best_fp_rate, fp_rate);
+      }
+    }
+    fp_rate_report_ << " Average FP rate %: " << 100.0 * fp / q << std::endl;
+    if (!FLAGS_quick && !FLAGS_best_case) {
+      fp_rate_report_ << " Worst FP rate %: " << 100.0 * worst_fp_rate
+                      << std::endl;
+      fp_rate_report_ << " Best FP rate %: " << 100.0 * best_fp_rate
+                      << std::endl;
+      fp_rate_report_ << " Best possible bits/key: "
+                      << -std::log(double(fp) / q) / std::log(2.0) << std::endl;
+    }
+  }
+  return ns;
+}
+
+// Prints the explanation of the terms used in the benchmark output.
+static void PrintLegend() {
+  std::cout
+      << "Legend:" << std::endl
+      << " \"Inside\" - key that was added to filter" << std::endl
+      << " \"Outside\" - key that was not added to filter" << std::endl
+      << " \"FN\" - false negative query (must not happen)" << std::endl
+      << " \"FP\" - false positive query (OK at low rate)" << std::endl
+      << " \"Dry run\" - cost of testing and hashing overhead." << std::endl
+      << " \"Gross filter\" - cost of filter queries including testing "
+      << "\n and hashing overhead." << std::endl
+      << " \"net\" - best estimate of time in filter operation, without "
+      << "\n testing and hashing overhead (gross filter - dry run)"
+      << std::endl
+      << " \"ns/op\" - nanoseconds per operation (key query or add)"
+      << std::endl
+      << " \"Single filter\" - essentially minimum cost, assuming filter"
+      << "\n fits easily in L1 CPU cache." << std::endl
+      << " \"Batched, prepared\" - several queries at once against a"
+      << "\n randomly chosen filter, using multi-query interface."
+      << std::endl
+      << " \"Batched, unprepared\" - similar, but using serial calls"
+      << "\n to single query interface." << std::endl
+      << " \"Random filter\" - a filter is chosen at random as target"
+      << "\n of each query." << std::endl
+      << " \"Skewed X% in Y%\" - like \"Random filter\" except Y% of"
+      << "\n the filters are designated as \"hot\" and receive X%"
+      << "\n of queries." << std::endl;
+}
+
+// Benchmark entry point: parses the flags, prints build warnings, then either
+// prints the legend (-legend) or runs the benchmark -runs times, advancing
+// the seed by 100 after each run.
+int main(int argc, char **argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [-quick] [OTHER OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  PrintWarnings();
+
+  if (FLAGS_legend) {
+    PrintLegend();
+    return 0;
+  }
+
+  FilterBench bench;
+  for (uint32_t run = 0; run < FLAGS_runs; ++run) {
+    bench.Go();
+    FLAGS_seed += 100;
+    bench.random_.Seed(FLAGS_seed);
+  }
+
+  return 0;
+}
+
+#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/util/gflags_compat.h b/src/rocksdb/util/gflags_compat.h
new file mode 100644
index 000000000..d5a30ce7e
--- /dev/null
+++ b/src/rocksdb/util/gflags_compat.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <gflags/gflags.h>
+
+#ifndef GFLAGS_NAMESPACE
+// in case it's not defined in old versions, that's probably because it was
+// still google by default.
+#define GFLAGS_NAMESPACE google
+#endif
+
+#ifndef DEFINE_uint32
+// DEFINE_uint32 does not appear in older versions of gflags. This should be
+// a sane definition for those versions.
+#define DEFINE_uint32(name, val, txt) \
+ DEFINE_VARIABLE(GFLAGS_NAMESPACE::uint32, U, name, val, txt)
+#endif
diff --git a/src/rocksdb/util/hash.cc b/src/rocksdb/util/hash.cc
new file mode 100644
index 000000000..d72be8a45
--- /dev/null
+++ b/src/rocksdb/util/hash.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string.h>
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+ // MurmurHash1 - fast but mediocre quality
+ // https://github.com/aappleby/smhasher/wiki/MurmurHash1
+ // Returns a 32-bit hash of the n bytes at data, mixed with seed.
+ const uint32_t m = 0xc6a4a793;  // multiplier applied after each absorbed chunk
+ const uint32_t r = 24;  // shift used only in the tail-byte mix below
+ const char* limit = data + n;
+ uint32_t h = static_cast<uint32_t>(seed ^ (n * m));  // truncated to 32 bits
+
+ // Pick up four bytes at a time
+ while (data + 4 <= limit) {
+ uint32_t w = DecodeFixed32(data);
+ data += 4;
+ h += w;
+ h *= m;
+ h ^= (h >> 16);  // literal 16 here, not r
+ }
+
+ // Pick up remaining bytes (0 to 3 are left once the loop above exits)
+ switch (limit - data) {
+ // Note: The original hash implementation used data[i] << shift, which
+ // promotes the char to int and then performs the shift. If the char is
+ // negative, the shift is undefined behavior in C++. The hash algorithm is
+ // part of the format definition, so we cannot change it; to obtain the same
+ // behavior in a legal way we just cast to uint32_t, which will do
+ // sign-extension. To guarantee compatibility with architectures where chars
+ // are unsigned we first cast the char to int8_t.
+ case 3:
+ h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
+ FALLTHROUGH_INTENDED;
+ case 2:
+ h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
+ FALLTHROUGH_INTENDED;
+ case 1:
+ h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
+ h *= m;
+ h ^= (h >> r);
+ break;
+ }  // no case for 0: with no tail bytes, h is returned as is
+ return h;
+}
+
+// We are standardizing on a preview release of XXH3, because that's
+// the best available at time of standardizing.
+//
+// In testing (mostly Intel Skylake), this hash function is much more
+// thorough than Hash32 and is almost universally faster. Hash() only
+// seems faster when passing runtime-sized keys of the same small size
+// (less than about 24 bytes) thousands of times in a row; this seems
+// to allow the branch predictor to work some magic. XXH3's speed is
+// much less dependent on branch prediction.
+//
+// Hashing with a prefix extractor is potentially a common case of
+// hashing objects of small, predictable size. We could consider
+// bundling hash functions specialized for particular lengths with
+// the prefix extractors.
+uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
+  // Seeded variant: forwards all three arguments to the XXH3 preview hash.
+  const uint64_t digest = XXH3p_64bits_withSeed(data, n, seed);
+  return digest;
+}
+
+uint64_t Hash64(const char* data, size_t n) {
+  // Seedless variant; gives the same result as passing seed = 0.
+  const uint64_t digest = XXH3p_64bits(data, n);
+  return digest;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/hash.h b/src/rocksdb/util/hash.h
new file mode 100644
index 000000000..17490a366
--- /dev/null
+++ b/src/rocksdb/util/hash.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Common hash functions with convenient interfaces. If hashing a
+// statically-sized input in a performance-critical context, consider
+// calling a specific hash implementation directly, such as
+// XXH3p_64bits from xxhash.h.
+//
+// Since this is a very common header, implementation details are kept
+// out-of-line. Out-of-lining also aids in tracking the time spent in
+// hashing functions. Inlining is of limited benefit for runtime-sized
+// hash inputs.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Stable/persistent 64-bit hash. Higher quality and generally faster than
+// Hash(), especially for inputs > 24 bytes.
+extern uint64_t Hash64(const char* data, size_t n, uint64_t seed);
+
+// Specific optimization without seed (same as seed = 0)
+extern uint64_t Hash64(const char* data, size_t n);
+
+// Non-persistent hash. Must only be used for in-memory data structures.
+// The hash results are thus subject to change. (Thus, it rarely makes
+// sense to specify a seed for this function.)
+//
+// The seed is accepted at full 64-bit width so that it reaches Hash64
+// unmodified instead of being silently truncated to 32 bits at this call
+// boundary. Callers passing a 32-bit seed are unaffected.
+inline uint64_t NPHash64(const char* data, size_t n, uint64_t seed) {
+  // Currently same as Hash64
+  return Hash64(data, n, seed);
+}
+
+// Seedless overload of the non-persistent hash (same as seed = 0).
+inline uint64_t NPHash64(const char* data, size_t n) {
+  // For now this simply delegates to the seedless Hash64.
+  const uint64_t digest = Hash64(data, n);
+  return digest;
+}
+
+// Stable/persistent 32-bit hash. Moderate quality and high speed on
+// small inputs.
+// TODO: consider rename to Hash32
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+// TODO: consider rename to LegacyBloomHash32
+inline uint32_t BloomHash(const Slice& key) {
+  // 32-bit Hash() of the key bytes under a fixed seed.
+  const uint32_t bloom_seed = 0xbc9f1d34;
+  return Hash(key.data(), key.size(), bloom_seed);
+}
+
+// Seedless 64-bit Hash64() over the bytes referenced by a Slice.
+inline uint64_t GetSliceHash64(const Slice& key) {
+  const char* bytes = key.data();
+  const size_t length = key.size();
+  return Hash64(bytes, length);
+}
+
+// Seedless NPHash64() over the bytes referenced by a Slice.
+inline uint64_t GetSliceNPHash64(const Slice& s) {
+  const char* bytes = s.data();
+  const size_t length = s.size();
+  return NPHash64(bytes, length);
+}
+
+// TODO: consider rename to GetSliceHash32
+inline uint32_t GetSliceHash(const Slice& s) {
+  // 32-bit Hash() of the slice bytes under the fixed seed 397.
+  const uint32_t slice_seed = 397;
+  return Hash(s.data(), s.size(), slice_seed);
+}
+
+// Returns the high-order 32 bits of v (useful for splitting a 64-bit hash).
+inline uint32_t Upper32of64(uint64_t v) {
+  const uint64_t high_bits = v >> 32;
+  return static_cast<uint32_t>(high_bits);
+}
+// Returns the low-order 32 bits of v; the upper half is discarded.
+inline uint32_t Lower32of64(uint64_t v) {
+  return static_cast<uint32_t>(v & 0xffffFFFFu);
+}
+
+// std::hash compatible interface.
+// TODO: consider rename to SliceHasher32
+struct SliceHasher {
+  // Yields the same 32-bit value as GetSliceHash(s).
+  uint32_t operator()(const Slice& s) const {
+    const uint32_t hashed = GetSliceHash(s);
+    return hashed;
+  }
+};
+
+// Maps a 32-bit hash onto a range without using %: the result is the upper
+// 32 bits of the 64-bit product range * hash (0 when range is 0). See
+// https://github.com/lemire/fastrange
+inline uint32_t fastrange32(uint32_t hash, uint32_t range) {
+  const uint64_t wide_range = range;
+  const uint64_t wide_hash = hash;
+  return static_cast<uint32_t>((wide_range * wide_hash) >> 32);
+}
+
+// An alternative to % for mapping a 64-bit hash value to an arbitrary range
+// that fits in size_t: returns the upper 64 bits of the 128-bit product
+// hash * range. See https://github.com/lemire/fastrange . size_t is used for
+// the range for convenience and for better optimization on 32-bit platforms.
+inline size_t fastrange64(uint64_t hash, size_t range) {
+#if defined(HAVE_UINT128_EXTENSION)
+ // Can use compiler's 128-bit type. Trust it to do the right thing.
+ __uint128_t wide = __uint128_t{range} * hash;
+ return static_cast<size_t>(wide >> 64);
+#else
+ // Fall back: full decomposition.
+ // NOTE: GCC seems to fully understand this code as 64-bit x {32 or 64}-bit
+ // -> {96 or 128}-bit multiplication and optimize it down to a single
+ // wide-result multiplication (64-bit platform) or two wide-result
+ // multiplications (32-bit platforms, where range64 >> 32 is zero).
+ uint64_t range64 = range; // ok to shift by 32, even if size_t is 32-bit
+ uint64_t tmp = uint64_t{range64 & 0xffffFFFF} * uint64_t{hash & 0xffffFFFF};
+ tmp >>= 32;  // keep only the carry out of the low x low product
+ tmp += uint64_t{range64 & 0xffffFFFF} * uint64_t{hash >> 32};
+ // Avoid overflow: first add lower 32 of tmp2, and later upper 32
+ uint64_t tmp2 = uint64_t{range64 >> 32} * uint64_t{hash & 0xffffFFFF};
+ tmp += static_cast<uint32_t>(tmp2);  // lower 32 bits of the second cross term
+ tmp >>= 32;  // drop product bits 32..63; tmp now counts from bit 64
+ tmp += (tmp2 >> 32);  // upper 32 bits of the second cross term
+ tmp += uint64_t{range64 >> 32} * uint64_t{hash >> 32};
+ return static_cast<size_t>(tmp);
+#endif
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/hash_map.h b/src/rocksdb/util/hash_map.h
new file mode 100644
index 000000000..6eb42506b
--- /dev/null
+++ b/src/rocksdb/util/hash_map.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <utility>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is similar to std::unordered_map, except that it tries to avoid
+// allocating or deallocating memory as much as possible. With
+// std::unordered_map, an allocation/deallocation is made for every insertion
+// or deletion because of the requirement that iterators remain valid even
+// with insertions or deletions. This means that the hash chains will be
+// implemented as linked lists.
+//
+// This implementation uses autovector as hash chains instead.
+//
+// Keys are mapped to a bucket with `key % size`, so K must support that
+// expression and the result must be usable as an index below `size`.
+// Insert() does not look for an existing entry: inserting the same key twice
+// stores two entries, and Delete() removes only one of them per call.
+template <typename K, typename V, size_t size = 128>
+class HashMap {
+  static_assert(size > 0, "HashMap requires at least one bucket");
+
+  std::array<autovector<std::pair<K, V>, 1>, size> table_;
+
+ public:
+  // Returns true iff an entry with this key is currently stored.
+  bool Contains(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    return it != bucket.end();
+  }
+
+  // Appends (key, value) to the key's bucket without checking for duplicates.
+  void Insert(K key, V value) {
+    auto& bucket = table_[key % size];
+    bucket.push_back({key, value});
+  }
+
+  // Removes the first entry found for `key`, if any. The last entry of the
+  // bucket is moved into the vacated slot, so order within a bucket is not
+  // preserved.
+  void Delete(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    if (it != bucket.end()) {
+      auto last = bucket.end() - 1;
+      if (it != last) {
+        // The last entry is popped right below, so it can be moved from.
+        *it = std::move(*last);
+      }
+      bucket.pop_back();
+    }
+  }
+
+  // Returns a reference to the value stored for `key`.
+  // Precondition: the key is present (see Contains()). There is no value to
+  // return otherwise, and dereferencing the end iterator is undefined
+  // behavior, so misuse is caught by the assert in debug builds.
+  V& Get(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    assert(it != bucket.end());
+    return it->second;
+  }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/hash_test.cc b/src/rocksdb/util/hash_test.cc
new file mode 100644
index 000000000..9c3c6efe9
--- /dev/null
+++ b/src/rocksdb/util/hash_test.cc
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+#include <vector>
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+using ROCKSDB_NAMESPACE::EncodeFixed32;
+using ROCKSDB_NAMESPACE::GetSliceHash64;
+using ROCKSDB_NAMESPACE::Hash;
+using ROCKSDB_NAMESPACE::Hash64;
+using ROCKSDB_NAMESPACE::Lower32of64;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::Upper32of64;
+
+// The hash algorithm is part of the file format, for example for the Bloom
+// filters. Test that the hash values are stable for a set of random strings of
+// varying lengths.
+TEST(HashTest, Values) {
+ constexpr uint32_t kSeed = 0xbc9f1d34; // Same as BloomHash.
+
+ EXPECT_EQ(Hash("", 0, kSeed), 3164544308u);
+ EXPECT_EQ(Hash("\x08", 1, kSeed), 422599524u);
+ EXPECT_EQ(Hash("\x17", 1, kSeed), 3168152998u);
+ EXPECT_EQ(Hash("\x9a", 1, kSeed), 3195034349u);
+ EXPECT_EQ(Hash("\x1c", 1, kSeed), 2651681383u);
+ EXPECT_EQ(Hash("\x4d\x76", 2, kSeed), 2447836956u);
+ EXPECT_EQ(Hash("\x52\xd5", 2, kSeed), 3854228105u);
+ EXPECT_EQ(Hash("\x91\xf7", 2, kSeed), 31066776u);
+ EXPECT_EQ(Hash("\xd6\x27", 2, kSeed), 1806091603u);
+ EXPECT_EQ(Hash("\x30\x46\x0b", 3, kSeed), 3808221797u);
+ EXPECT_EQ(Hash("\x56\xdc\xd6", 3, kSeed), 2157698265u);
+ EXPECT_EQ(Hash("\xd4\x52\x33", 3, kSeed), 1721992661u);
+ EXPECT_EQ(Hash("\x6a\xb5\xf4", 3, kSeed), 2469105222u);
+ EXPECT_EQ(Hash("\x67\x53\x81\x1c", 4, kSeed), 118283265u);
+ EXPECT_EQ(Hash("\x69\xb8\xc0\x88", 4, kSeed), 3416318611u);
+ EXPECT_EQ(Hash("\x1e\x84\xaf\x2d", 4, kSeed), 3315003572u);
+ EXPECT_EQ(Hash("\x46\xdc\x54\xbe", 4, kSeed), 447346355u);
+ EXPECT_EQ(Hash("\xd0\x7a\x6e\xea\x56", 5, kSeed), 4255445370u);
+ EXPECT_EQ(Hash("\x86\x83\xd5\xa4\xd8", 5, kSeed), 2390603402u);
+ EXPECT_EQ(Hash("\xb7\x46\xbb\x77\xce", 5, kSeed), 2048907743u);
+ EXPECT_EQ(Hash("\x6c\xa8\xbc\xe5\x99", 5, kSeed), 2177978500u);
+ EXPECT_EQ(Hash("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), 1036846008u);
+ EXPECT_EQ(Hash("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), 229980482u);
+ EXPECT_EQ(Hash("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), 3655585422u);
+ EXPECT_EQ(Hash("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), 3502708029u);
+ EXPECT_EQ(Hash("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), 815120748u);
+ EXPECT_EQ(Hash("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), 3056033698u);
+ EXPECT_EQ(Hash("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), 587205227u);
+ EXPECT_EQ(Hash("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), 2030937252u);
+ EXPECT_EQ(Hash("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), 469635402u);
+ EXPECT_EQ(Hash("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), 3530274698u);
+ EXPECT_EQ(Hash("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), 1974545809u);
+ EXPECT_EQ(Hash("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), 3563570120u);
+ EXPECT_EQ(Hash("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed),
+ 2706087434u);
+ EXPECT_EQ(Hash("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed),
+ 1534654151u);
+ EXPECT_EQ(Hash("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed),
+ 2355554696u);
+ EXPECT_EQ(Hash("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed),
+ 1400800912u);
+ EXPECT_EQ(Hash("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed),
+ 3420325137u);
+ EXPECT_EQ(Hash("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed),
+ 3427803584u);
+ EXPECT_EQ(Hash("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed),
+ 1152407945u);
+ EXPECT_EQ(Hash("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed),
+ 3382479516u);
+}
+
+// The hash algorithm is part of the file format, for example for the Bloom
+// filters.
+TEST(HashTest, Hash64Misc) {
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash64
+  const size_t kMaxLen = 1000;
+
+  for (char fill : {'\0', 'a', '1', '\xff'}) {
+    const std::string buf(kMaxLen, fill);
+
+    for (size_t len = 0; len <= kMaxLen; ++len) {
+      const uint64_t h = Hash64(buf.data(), len, kSeed);
+      const uint64_t upper = uint64_t{Upper32of64(h)} << 32;
+      const uint32_t lower = Lower32of64(h);
+
+      // The seedless Slice helper must agree with an explicit zero seed
+      EXPECT_EQ(h, GetSliceHash64(Slice(buf.data(), len)));
+
+      // The two 32-bit halves must recombine into the full hash
+      EXPECT_EQ(h, upper | lower);
+      EXPECT_EQ(h, upper + lower);
+      EXPECT_EQ(h, upper ^ lower);
+
+      // Every single-bit seed changes the hash (with high probability)
+      for (int bit = 0; bit < 64; ++bit) {
+        EXPECT_NE(h, Hash64(buf.data(), len, uint64_t{1} << bit));
+      }
+
+      // Shortening the input by 1..30 bytes changes the hash (with high
+      // probability)
+      const size_t shrink_limit = std::min(size_t{30}, len);
+      for (size_t shrink = 1; shrink <= shrink_limit; ++shrink) {
+        EXPECT_NE(h, Hash64(buf.data(), len - shrink, kSeed));
+      }
+    }
+  }
+}
+
+// Test that hash values are "non-trivial" for "trivial" inputs
+TEST(HashTest, Hash64Trivial) {
+  // The exhaustive variant is too slow to run as a regression test
+  constexpr bool kThorough = false;
+  constexpr uint64_t kSeedLimit = kThorough ? 0x1000000 : 0x10000;
+  constexpr int kMaxLen = kThorough ? 3 : 2;
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash64
+
+  // Empty string: neither 32-bit half of the hash may be zero, for any seed
+  // below kSeedLimit.
+  for (uint64_t seed = 0; seed < kSeedLimit; ++seed) {
+    const uint64_t h = Hash64("", 0, seed);
+    EXPECT_NE(Lower32of64(h), 0u);
+    EXPECT_NE(Upper32of64(h), 0u);
+  }
+
+  // Standard seed: same requirement for short inputs, taken as the first
+  // `len` bytes of EncodeFixed32(i) for every i below 2^(8 * len).
+  char input[4];
+  for (int len = 1; len <= kMaxLen; ++len) {
+    const uint32_t num_inputs = uint32_t{1} << (len * 8);
+    for (uint32_t i = 0; i < num_inputs; ++i) {
+      EncodeFixed32(input, i);
+      const uint64_t h = Hash64(input, len, kSeed);
+      EXPECT_NE(Lower32of64(h), 0u);
+      EXPECT_NE(Upper32of64(h), 0u);
+    }
+  }
+}
+
+// Test that the hash values are stable for a set of random strings of
+// varying small lengths.
+TEST(HashTest, Hash64SmallValueSchema) {
+ constexpr uint32_t kSeed = 0; // Same as GetSliceHash64
+
+ EXPECT_EQ(Hash64("", 0, kSeed), uint64_t{5999572062939766020u});
+ EXPECT_EQ(Hash64("\x08", 1, kSeed), uint64_t{583283813901344696u});
+ EXPECT_EQ(Hash64("\x17", 1, kSeed), uint64_t{16175549975585474943u});
+ EXPECT_EQ(Hash64("\x9a", 1, kSeed), uint64_t{16322991629225003903u});
+ EXPECT_EQ(Hash64("\x1c", 1, kSeed), uint64_t{13269285487706833447u});
+ EXPECT_EQ(Hash64("\x4d\x76", 2, kSeed), uint64_t{6859542833406258115u});
+ EXPECT_EQ(Hash64("\x52\xd5", 2, kSeed), uint64_t{4919611532550636959u});
+ EXPECT_EQ(Hash64("\x91\xf7", 2, kSeed), uint64_t{14199427467559720719u});
+ EXPECT_EQ(Hash64("\xd6\x27", 2, kSeed), uint64_t{12292689282614532691u});
+ EXPECT_EQ(Hash64("\x30\x46\x0b", 3, kSeed), uint64_t{11404699285340020889u});
+ EXPECT_EQ(Hash64("\x56\xdc\xd6", 3, kSeed), uint64_t{12404347133785524237u});
+ EXPECT_EQ(Hash64("\xd4\x52\x33", 3, kSeed), uint64_t{15853805298481534034u});
+ EXPECT_EQ(Hash64("\x6a\xb5\xf4", 3, kSeed), uint64_t{16863488758399383382u});
+ EXPECT_EQ(Hash64("\x67\x53\x81\x1c", 4, kSeed),
+ uint64_t{9010661983527562386u});
+ EXPECT_EQ(Hash64("\x69\xb8\xc0\x88", 4, kSeed),
+ uint64_t{6611781377647041447u});
+ EXPECT_EQ(Hash64("\x1e\x84\xaf\x2d", 4, kSeed),
+ uint64_t{15290969111616346501u});
+ EXPECT_EQ(Hash64("\x46\xdc\x54\xbe", 4, kSeed),
+ uint64_t{7063754590279313623u});
+ EXPECT_EQ(Hash64("\xd0\x7a\x6e\xea\x56", 5, kSeed),
+ uint64_t{6384167718754869899u});
+ EXPECT_EQ(Hash64("\x86\x83\xd5\xa4\xd8", 5, kSeed),
+ uint64_t{16874407254108011067u});
+ EXPECT_EQ(Hash64("\xb7\x46\xbb\x77\xce", 5, kSeed),
+ uint64_t{16809880630149135206u});
+ EXPECT_EQ(Hash64("\x6c\xa8\xbc\xe5\x99", 5, kSeed),
+ uint64_t{1249038833153141148u});
+ EXPECT_EQ(Hash64("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed),
+ uint64_t{17358142495308219330u});
+ EXPECT_EQ(Hash64("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed),
+ uint64_t{4237646583134806322u});
+ EXPECT_EQ(Hash64("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed),
+ uint64_t{4373664924115234051u});
+ EXPECT_EQ(Hash64("\x73\xe1\xff\x56\x9c\xce", 6, kSeed),
+ uint64_t{12012981210634596029u});
+ EXPECT_EQ(Hash64("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed),
+ uint64_t{5716522398211028826u});
+ EXPECT_EQ(Hash64("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed),
+ uint64_t{15604531309862565013u});
+ EXPECT_EQ(Hash64("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed),
+ uint64_t{8601330687345614172u});
+ EXPECT_EQ(Hash64("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed),
+ uint64_t{8088079329364056942u});
+ EXPECT_EQ(Hash64("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed),
+ uint64_t{9844314944338447628u});
+ EXPECT_EQ(Hash64("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed),
+ uint64_t{10973293517982163143u});
+ EXPECT_EQ(Hash64("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed),
+ uint64_t{9986007080564743219u});
+ EXPECT_EQ(Hash64("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed),
+ uint64_t{1729303145008254458u});
+ EXPECT_EQ(Hash64("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed),
+ uint64_t{13253403748084181481u});
+ EXPECT_EQ(Hash64("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed),
+ uint64_t{7768754303876232188u});
+ EXPECT_EQ(Hash64("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed),
+ uint64_t{12439346786701492u});
+ EXPECT_EQ(Hash64("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed),
+ uint64_t{10841838338450144690u});
+ EXPECT_EQ(Hash64("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed),
+ uint64_t{12883919702069153152u});
+ EXPECT_EQ(Hash64("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed),
+ uint64_t{12692903507676842188u});
+ EXPECT_EQ(Hash64("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed),
+ uint64_t{6540985900674032620u});
+ EXPECT_EQ(Hash64("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed),
+ uint64_t{10551812464348219044u});
+}
+
+// Builds a compact "descriptor" of GetSliceHash64 over growing prefixes of
+// `repeat` repeated end-to-end: character i of the result (for i in
+// [0, limit)) encodes (hash of the first i bytes) % 61.
+//
+// `repeat` must be a non-empty string. A null or empty pattern can never
+// produce `limit` bytes of input, so an empty descriptor is returned for it
+// rather than looping forever; any comparison against a real expected
+// descriptor then fails immediately.
+std::string Hash64TestDescriptor(const char *repeat, size_t limit) {
+  // 61 distinct symbols, indexed by hash % 61
+  const char *mod61_encode =
+      "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+  if (repeat == nullptr || *repeat == '\0') {
+    return std::string();
+  }
+
+  // Repeat the pattern until at least `limit` bytes are available.
+  std::string input;
+  while (input.size() < limit) {
+    input.append(repeat);
+  }
+
+  std::string rv;
+  rv.reserve(limit);
+  for (size_t i = 0; i < limit; ++i) {
+    uint64_t h = GetSliceHash64(Slice(input.data(), i));
+    rv.append(1, mod61_encode[static_cast<size_t>(h % 61)]);
+  }
+  return rv;
+}
+
+// XXH3p changes its algorithm for various sizes up through 250 bytes, so
+// we need to check the stability of larger sizes also.
+TEST(HashTest, Hash64LargeValueSchema) {
+ // Each of these derives a "descriptor" from the hash values for all
+ // lengths up to 430.
+ // Note that "c" is common for the zero-length string.
+ EXPECT_EQ(
+ Hash64TestDescriptor("foo", 430),
+ "cRhyWsY67B6klRA1udmOuiYuX7IthyGBKqbeosz2hzVglWCmQx8nEdnpkvPfYX56Up2OWOTV"
+ "lTzfAoYwvtqKzjD8E9xttR2unelbXbIV67NUe6bOO23BxaSFRcA3njGu5cUWfgwOqNoTsszp"
+ "uPvKRP6qaUR5VdoBkJUCFIefd7edlNK5mv6JYWaGdwxehg65hTkTmjZoPKxTZo4PLyzbL9U4"
+ "xt12ITSfeP2MfBHuLI2z2pDlBb44UQKVMx27LEoAHsdLp3WfWfgH3sdRBRCHm33UxCM4QmE2"
+ "xJ7gqSvNwTeH7v9GlC8zWbGroyD3UVNeShMLx29O7tH1biemLULwAHyIw8zdtLMDpEJ8m2ic"
+ "l6Lb4fDuuFNAs1GCVUthjK8CV8SWI8Rsz5THSwn5CGhpqUwSZcFknjwWIl5rNCvDxXJqYr");
+ // Note that "1EeRk" is common for "Rocks"
+ EXPECT_EQ(
+ Hash64TestDescriptor("Rocks", 430),
+ "c1EeRkrzgOYWLA8PuhJrwTePJewoB44WdXYDfhbk3ZxTqqg25WlPExDl7IKIQLJvnA6gJxxn"
+ "9TCSLkFGfJeXehaSS1GBqWSzfhEH4VXiXIUCuxJXxtKXcSC6FrNIQGTZbYDiUOLD6Y5inzrF"
+ "9etwQhXUBanw55xAUdNMFQAm2GjJ6UDWp2mISLiMMkLjANWMKLaZMqaFLX37qB4MRO1ooVRv"
+ "zSvaNRSCLxlggQCasQq8icWjzf3HjBlZtU6pd4rkaUxSzHqmo9oM5MghbU5Rtxg8wEfO7lVN"
+ "5wdMONYecslQTwjZUpO1K3LDf3K3XK6sUXM6ShQQ3RHmMn2acB4YtTZ3QQcHYJSOHn2DuWpa"
+ "Q8RqzX5lab92YmOLaCdOHq1BPsM7SIBzMdLgePNsJ1vvMALxAaoDUHPxoFLO2wx18IXnyX");
+ EXPECT_EQ(
+ Hash64TestDescriptor("RocksDB", 430),
+ "c1EeRkukbkb28wLTahwD2sfUhZzaBEnF8SVrxnPVB6A7b8CaAl3UKsDZISF92GSq2wDCukOq"
+ "Jgrsp7A3KZhDiLW8dFXp8UPqPxMCRlMdZeVeJ2dJxrmA6cyt99zkQFj7ELbut6jAeVqARFnw"
+ "fnWVXOsaLrq7bDCbMcns2DKvTaaqTCLMYxI7nhtLpFN1jR755FRQFcOzrrDbh7QhypjdvlYw"
+ "cdAMSZgp9JMHxbM23wPSuH6BOFgxejz35PScZfhDPvTOxIy1jc3MZsWrMC3P324zNolO7JdW"
+ "CX2I5UDKjjaEJfxbgVgJIXxtQGlmj2xkO5sPpjULQV4X2HlY7FQleJ4QRaJIB4buhCA4vUTF"
+ "eMFlxCIYUpTCsal2qsmnGOWa8WCcefrohMjDj1fjzSvSaQwlpyR1GZHF2uPOoQagiCpHpm");
+}
+
+TEST(Fastrange32Test, Values) {
+ using ROCKSDB_NAMESPACE::fastrange32;
+ // Zero range: the result is 0 for every hash
+ EXPECT_EQ(fastrange32(0, 0), 0U);
+ EXPECT_EQ(fastrange32(123, 0), 0U);
+ EXPECT_EQ(fastrange32(0xffffffff, 0), 0U);
+
+ // One range: 0 is the only possible result
+ EXPECT_EQ(fastrange32(0, 1), 0U);
+ EXPECT_EQ(fastrange32(123, 1), 0U);
+ EXPECT_EQ(fastrange32(0xffffffff, 1), 0U);
+
+ // Two range: the top bit of the hash selects the result
+ EXPECT_EQ(fastrange32(0, 2), 0U);
+ EXPECT_EQ(fastrange32(123, 2), 0U);
+ EXPECT_EQ(fastrange32(0x7fffffff, 2), 0U);
+ EXPECT_EQ(fastrange32(0x80000000, 2), 1U);
+ EXPECT_EQ(fastrange32(0xffffffff, 2), 1U);
+
+ // Seven range: the result steps up at multiples of 2^32 / 7
+ EXPECT_EQ(fastrange32(0, 7), 0U);
+ EXPECT_EQ(fastrange32(123, 7), 0U);
+ EXPECT_EQ(fastrange32(613566756, 7), 0U);
+ EXPECT_EQ(fastrange32(613566757, 7), 1U);
+ EXPECT_EQ(fastrange32(1227133513, 7), 1U);
+ EXPECT_EQ(fastrange32(1227133514, 7), 2U);
+ // etc.
+ EXPECT_EQ(fastrange32(0xffffffff, 7), 6U);
+
+ // Big ranges (around 2^31)
+ EXPECT_EQ(fastrange32(1, 0x80000000), 0U);
+ EXPECT_EQ(fastrange32(2, 0x80000000), 1U);
+ EXPECT_EQ(fastrange32(4, 0x7fffffff), 1U);
+ EXPECT_EQ(fastrange32(4, 0x80000000), 2U);
+ EXPECT_EQ(fastrange32(0xffffffff, 0x7fffffff), 0x7ffffffeU);
+ EXPECT_EQ(fastrange32(0xffffffff, 0x80000000), 0x7fffffffU);
+}
+
+TEST(Fastrange64Test, Values) {
+ using ROCKSDB_NAMESPACE::fastrange64;
+ // Zero range: the result is 0 for every hash
+ EXPECT_EQ(fastrange64(0, 0), 0U);
+ EXPECT_EQ(fastrange64(123, 0), 0U);
+ EXPECT_EQ(fastrange64(0xffffFFFF, 0), 0U);
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0), 0U);
+
+ // One range: 0 is the only possible result
+ EXPECT_EQ(fastrange64(0, 1), 0U);
+ EXPECT_EQ(fastrange64(123, 1), 0U);
+ EXPECT_EQ(fastrange64(0xffffFFFF, 1), 0U);
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 1), 0U);
+
+ // Two range: the top bit of the hash selects the result
+ EXPECT_EQ(fastrange64(0, 2), 0U);
+ EXPECT_EQ(fastrange64(123, 2), 0U);
+ EXPECT_EQ(fastrange64(0xffffFFFF, 2), 0U);
+ EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 2), 0U);
+ EXPECT_EQ(fastrange64(0x8000000000000000, 2), 1U);
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 2), 1U);
+
+ // Seven range: the result steps up at multiples of 2^64 / 7
+ EXPECT_EQ(fastrange64(0, 7), 0U);
+ EXPECT_EQ(fastrange64(123, 7), 0U);
+ EXPECT_EQ(fastrange64(0xffffFFFF, 7), 0U);
+ EXPECT_EQ(fastrange64(2635249153387078802, 7), 0U);
+ EXPECT_EQ(fastrange64(2635249153387078803, 7), 1U);
+ EXPECT_EQ(fastrange64(5270498306774157604, 7), 1U);
+ EXPECT_EQ(fastrange64(5270498306774157605, 7), 2U);
+ EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 7), 3U);
+ EXPECT_EQ(fastrange64(0x8000000000000000, 7), 3U);
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 7), 6U);
+
+ // Big but 32-bit range (the range argument still fits in 32 bits)
+ EXPECT_EQ(fastrange64(0x100000000, 0x80000000), 0U);
+ EXPECT_EQ(fastrange64(0x200000000, 0x80000000), 1U);
+ EXPECT_EQ(fastrange64(0x400000000, 0x7fffFFFF), 1U);
+ EXPECT_EQ(fastrange64(0x400000000, 0x80000000), 2U);
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0x7fffFFFF), 0x7fffFFFEU);
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0x80000000), 0x7fffFFFFU);
+
+ // Big, > 32-bit range (compiled only when size_t is 64 bits wide)
+#if SIZE_MAX == UINT64_MAX
+ EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 0x4200000002), 0x2100000000U);
+ EXPECT_EQ(fastrange64(0x8000000000000000, 0x4200000002), 0x2100000001U);
+
+ EXPECT_EQ(fastrange64(0x0000000000000000, 420000000002), 0U);
+ EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 420000000002), 210000000000U);
+ EXPECT_EQ(fastrange64(0x8000000000000000, 420000000002), 210000000001U);
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 420000000002), 420000000001U);
+
+ EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0xffffFFFFffffFFFF),
+ 0xffffFFFFffffFFFEU);
+#endif
+}
+
+// Non-inline global wrapper around ROCKSDB_NAMESPACE::fastrange32, kept for
+// inspection of disassembly.
+uint32_t fastrange32(uint32_t hash, uint32_t range) {
+  const uint32_t mapped = ROCKSDB_NAMESPACE::fastrange32(hash, range);
+  return mapped;
+}
+
+// Non-inline global wrapper around ROCKSDB_NAMESPACE::fastrange64, kept for
+// inspection of disassembly.
+size_t fastrange64(uint64_t hash, size_t range) {
+  const size_t mapped = ROCKSDB_NAMESPACE::fastrange64(hash, range);
+  return mapped;
+}
+
+// Test binary entry point: hands the command line to googletest, then runs
+// the tests and returns the value of RUN_ALL_TESTS().
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
diff --git a/src/rocksdb/util/heap.h b/src/rocksdb/util/heap.h
new file mode 100644
index 000000000..c72d944d3
--- /dev/null
+++ b/src/rocksdb/util/heap.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Binary heap implementation optimized for use in multi-way merge sort.
+// Comparison to std::priority_queue:
+// - In libstdc++, std::priority_queue::pop() usually performs just over logN
+// comparisons but never fewer.
+// - std::priority_queue does not have a replace-top operation, requiring a
+// pop+push. If the replacement element is the new top, this requires
+// around 2logN comparisons.
+// - This heap's pop() uses a "schoolbook" downheap which requires up to ~2logN
+// comparisons.
+// - This heap provides a replace_top() operation which requires [1, 2logN]
+// comparisons. When the replacement element is also the new top, this
+// takes just 1 or 2 comparisons.
+//
+// The last property can yield an order-of-magnitude performance improvement
+// when merge-sorting real-world non-random data. If the merge operation is
+// likely to take chunks of elements from the same input stream, only 1
+// comparison per element is needed. In RocksDB-land, this happens when
+// compacting a database where keys are not randomly distributed across L0
+// files but nearby keys are likely to be in the same L0 file.
+//
+// The container uses the same counterintuitive ordering as
+// std::priority_queue: the comparison operator is expected to provide the
+// less-than relation, but top() will return the maximum.
+
+template<typename T, typename Compare = std::less<T>>
+class BinaryHeap {
+ public:
+ BinaryHeap() { }
+ explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) { }
+
+ void push(const T& value) {  // append a copy at the end, then sift it up
+ data_.push_back(value);
+ upheap(data_.size() - 1);
+ }
+
+ void push(T&& value) {  // same as above, moving the value in
+ data_.push_back(std::move(value));
+ upheap(data_.size() - 1);
+ }
+
+ const T& top() const {  // heap must be non-empty
+ assert(!empty());
+ return data_.front();
+ }
+
+ void replace_top(const T& value) {  // overwrite the root, then sift it down
+ assert(!empty());
+ data_.front() = value;
+ downheap(get_root());
+ }
+
+ void replace_top(T&& value) {  // same as above, moving the value in
+ assert(!empty());
+ data_.front() = std::move(value);
+ downheap(get_root());
+ }
+
+ void pop() {
+ assert(!empty());
+ data_.front() = std::move(data_.back());  // last element takes the root slot
+ data_.pop_back();
+ if (!empty()) {
+ downheap(get_root());
+ } else {
+ reset_root_cmp_cache();  // nothing left to cache a decision about
+ }
+ }
+
+ void swap(BinaryHeap &other) {  // exchanges comparator, elements and cache
+ std::swap(cmp_, other.cmp_);
+ data_.swap(other.data_);
+ std::swap(root_cmp_cache_, other.root_cmp_cache_);
+ }
+
+ void clear() {
+ data_.clear();
+ reset_root_cmp_cache();
+ }
+
+ bool empty() const { return data_.empty(); }
+
+ size_t size() const { return data_.size(); }
+
+ void reset_root_cmp_cache() { root_cmp_cache_ = port::kMaxSizet; }
+
+ private:
+ static inline size_t get_root() { return 0; }
+ static inline size_t get_parent(size_t index) { return (index - 1) / 2; }
+ static inline size_t get_left(size_t index) { return 2 * index + 1; }
+ static inline size_t get_right(size_t index) { return 2 * index + 2; }
+
+ void upheap(size_t index) {
+ T v = std::move(data_[index]);  // hold the element while parents shift down
+ while (index > get_root()) {
+ const size_t parent = get_parent(index);
+ if (!cmp_(data_[parent], v)) {  // parent is not less than v: v stays below
+ break;
+ }
+ data_[index] = std::move(data_[parent]);
+ index = parent;
+ }
+ data_[index] = std::move(v);
+ reset_root_cmp_cache();  // the cached root decision is dropped on every push
+ }
+
+ void downheap(size_t index) {
+ T v = std::move(data_[index]);  // hold the element while children shift up
+
+ size_t picked_child = port::kMaxSizet;
+ while (1) {
+ const size_t left_child = get_left(index);
+ if (get_left(index) >= data_.size()) {  // no children: index is a leaf
+ break;
+ }
+ const size_t right_child = left_child + 1;
+ assert(right_child == get_right(index));
+ picked_child = left_child;
+ if (index == 0 && root_cmp_cache_ < data_.size()) {
+ picked_child = root_cmp_cache_;  // reuse the cached left/right decision
+ } else if (right_child < data_.size() &&
+ cmp_(data_[left_child], data_[right_child])) {
+ picked_child = right_child;  // left is less than right: pick right
+ }
+ if (!cmp_(v, data_[picked_child])) {  // v is not less than the child: done
+ break;
+ }
+ data_[index] = std::move(data_[picked_child]);
+ index = picked_child;
+ }
+
+ if (index == 0) {
+ // We did not change anything in the tree except for the value
+ // of the root node; left and right child did not change, so we can
+ // cache that `picked_child` won the left/right comparison and,
+ // next time, compare against it directly
+ root_cmp_cache_ = picked_child;
+ } else {
+ // the tree changed, reset cache
+ reset_root_cmp_cache();
+ }
+
+ data_[index] = std::move(v);
+ }
+
+ Compare cmp_;
+ autovector<T> data_;  // heap storage; index 0 is the root
+ // Cached pick among the root's children (kMaxSizet = none); saves cmp_ calls
+ size_t root_cmp_cache_ = port::kMaxSizet;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/heap_test.cc b/src/rocksdb/util/heap_test.cc
new file mode 100644
index 000000000..1f188e73b
--- /dev/null
+++ b/src/rocksdb/util/heap_test.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <gtest/gtest.h>
+
+#include <climits>
+
+#include <queue>
+#include <random>
+#include <utility>
+
+#include "util/heap.h"
+
+#ifdef GFLAGS
+#include "util/gflags_compat.h"
+DEFINE_int64(iters, 100000, "number of pseudo-random operations in each test");
+#else
+// Without gflags the iteration count is a fixed constant.
+const int64_t FLAGS_iters = 100000;
+#endif  // GFLAGS
+
+/*
+ * Compares the custom heap implementation in util/heap.h against
+ * std::priority_queue on a pseudo-random sequence of operations.
+ */
+
+namespace ROCKSDB_NAMESPACE {
+
+// Element type stored in both heaps under test.
+typedef uint64_t HeapTestValue;
+// Test parameters, in order: maximum heap size, maximum value, RNG seed.
+typedef std::tuple<size_t, HeapTestValue, int64_t> Params;
+
+// Parameterized fixture; it adds no state of its own.
+class HeapTest : public ::testing::TestWithParam<Params> {};
+
+TEST_P(HeapTest, Test) {
+  // This test performs the same pseudorandom sequence of operations on a
+  // BinaryHeap and an std::priority_queue, comparing output. The three
+  // possible operations are insert, replace top and pop.
+  //
+  // Insert is chosen slightly more often than the others so that the size of
+  // the heap slowly grows. Once the size hits the MAX_HEAP_SIZE limit, we
+  // disallow inserting until the heap becomes empty, testing the "draining"
+  // scenario.
+  //
+  // All invariants use gtest assertions rather than assert() so that they
+  // are still checked when the test is built with NDEBUG.
+
+  const auto MAX_HEAP_SIZE = std::get<0>(GetParam());
+  const auto MAX_VALUE = std::get<1>(GetParam());
+  const auto RNG_SEED = std::get<2>(GetParam());
+
+  BinaryHeap<HeapTestValue> heap;
+  std::priority_queue<HeapTestValue> ref;
+
+  std::mt19937 rng(static_cast<unsigned int>(RNG_SEED));
+  std::uniform_int_distribution<HeapTestValue> value_dist(0, MAX_VALUE);
+  int ndrains = 0;
+  bool draining = false;  // hit max size, draining until we empty the heap
+  size_t size = 0;
+  for (int64_t i = 0; i < FLAGS_iters; ++i) {
+    if (size == 0) {
+      draining = false;
+    }
+
+    if (!draining &&
+        (size == 0 || std::bernoulli_distribution(0.4)(rng))) {
+      // insert
+      HeapTestValue val = value_dist(rng);
+      heap.push(val);
+      ref.push(val);
+      ++size;
+      if (size == MAX_HEAP_SIZE) {
+        draining = true;
+        ++ndrains;
+      }
+    } else if (std::bernoulli_distribution(0.5)(rng)) {
+      // replace top
+      HeapTestValue val = value_dist(rng);
+      heap.replace_top(val);
+      ref.pop();
+      ref.push(val);
+    } else {
+      // pop
+      ASSERT_GT(size, size_t{0});
+      heap.pop();
+      ref.pop();
+      --size;
+    }
+
+    // After every operation, check that the public methods give the same
+    // results
+    ASSERT_EQ(size == 0, ref.empty());
+    ASSERT_EQ(size == 0, heap.empty());
+    if (size > 0) {
+      ASSERT_EQ(ref.top(), heap.top());
+    }
+  }
+
+  // Probabilities should be set up to occasionally hit the max heap size and
+  // drain it
+  ASSERT_GT(ndrains, 0);
+
+  heap.clear();
+  ASSERT_TRUE(heap.empty());
+}
+
+// Each instantiation passes one Params(max heap size, max value, RNG seed).
+
+// Basic: the value range is 3x the heap capacity (occasional duplicates).
+INSTANTIATE_TEST_CASE_P(Basic, HeapTest,
+                        ::testing::Values(Params(1000, 3000,
+                                                 0x1b575cf05b708945)));
+// SmallValues: mid-size heap, tiny value range (many duplicates).
+INSTANTIATE_TEST_CASE_P(SmallValues, HeapTest,
+                        ::testing::Values(Params(100, 10,
+                                                 0x5ae213f7bd5dccd0)));
+// SmallHeap: small heap, very large value range (no duplicates).
+INSTANTIATE_TEST_CASE_P(SmallHeap, HeapTest,
+                        ::testing::Values(Params(10, ULLONG_MAX,
+                                                 0x3e1fa8f4d01707cf)));
+// TwoElementHeap: capacity of two.
+INSTANTIATE_TEST_CASE_P(TwoElementHeap, HeapTest,
+                        ::testing::Values(Params(2, 5,
+                                                 0x4b5e13ea988c6abc)));
+// OneElementHeap: capacity of one.
+INSTANTIATE_TEST_CASE_P(OneElementHeap, HeapTest,
+                        ::testing::Values(Params(1, 3,
+                                                 0x176a1019ab0b612e)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test entry point: initializes gtest, parses gflags when built with GFLAGS,
+// then runs every registered test.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/util/kv_map.h b/src/rocksdb/util/kv_map.h
new file mode 100644
index 000000000..0f713ccea
--- /dev/null
+++ b/src/rocksdb/util/kv_map.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace stl_wrappers {
+
+// "Less than" functor that delegates to a Comparator, so a Comparator can be
+// used as the ordering of a standard container. The comparator pointer is
+// stored as-is and defaults to BytewiseComparator().
+struct LessOfComparator {
+  explicit LessOfComparator(const Comparator* c = BytewiseComparator())
+      : cmp(c) {}
+
+  // True when `a` orders strictly before `b` under cmp.
+  bool operator()(const Slice& a, const Slice& b) const {
+    const int order = cmp->Compare(a, b);
+    return order < 0;
+  }
+  // Same ordering for std::string keys, which are wrapped in Slices.
+  bool operator()(const std::string& a, const std::string& b) const {
+    const int order = cmp->Compare(Slice(a), Slice(b));
+    return order < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+// String-to-string map ordered by a LessOfComparator.
+using KVMap = std::map<std::string, std::string, LessOfComparator>;
+}  // namespace stl_wrappers
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/log_write_bench.cc b/src/rocksdb/util/log_write_bench.cc
new file mode 100644
index 000000000..60798babf
--- /dev/null
+++ b/src/rocksdb/util/log_write_bench.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+// Built without gflags: report that the tool is unavailable and fail.
+int main() {
+  fputs("Please install gflags to run rocksdb tools\n", stderr);
+  return 1;
+}
+#else
+
+#include "file/writable_file_writer.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+// A simple benchmark to simulate transactional logs
+
+DEFINE_int32(num_records, 6000, "Number of records.");
+DEFINE_int32(record_size, 249, "Size of each record.");
+DEFINE_int32(record_interval, 10000, "Interval between records (microSec)");
+DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions");
+DEFINE_bool(enable_sync, false, "sync after each write.");
+
+namespace ROCKSDB_NAMESPACE {
+// Writes FLAGS_num_records records of FLAGS_record_size bytes to a log file,
+// flushing (and optionally syncing) after each one, paced so that record i
+// completes no earlier than (i + 1) * FLAGS_record_interval microseconds
+// after the start. Prints a histogram of per-record append+flush latency to
+// stderr. Stops early with a message on stderr if the file cannot be created
+// or a write fails.
+void RunBenchmark() {
+  std::string file_name = test::PerThreadDBPath("log_write_benchmark.log");
+  DBOptions options;
+  Env* env = Env::Default();
+  EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions(), options);
+  env_options.bytes_per_sync = FLAGS_bytes_per_sync;
+  std::unique_ptr<WritableFile> file;
+  Status s = env->NewWritableFile(file_name, &file, env_options);
+  if (!s.ok()) {
+    fprintf(stderr, "Failed to create %s: %s\n", file_name.c_str(),
+            s.ToString().c_str());
+    return;
+  }
+  std::unique_ptr<WritableFileWriter> writer;
+  writer.reset(new WritableFileWriter(std::move(file), file_name, env_options,
+                                      env, nullptr /* stats */,
+                                      options.listeners));
+
+  std::string record;
+  record.assign(FLAGS_record_size, 'X');
+
+  HistogramImpl hist;
+
+  uint64_t start_time = env->NowMicros();
+  for (int i = 0; i < FLAGS_num_records; i++) {
+    uint64_t start_nanos = env->NowNanos();
+    s = writer->Append(record);
+    if (s.ok()) {
+      s = writer->Flush();
+    }
+    if (s.ok() && FLAGS_enable_sync) {
+      s = writer->Sync(false);
+    }
+    if (!s.ok()) {
+      // A failed write would make the latency numbers meaningless.
+      fprintf(stderr, "Write of record %d failed: %s\n", i,
+              s.ToString().c_str());
+      return;
+    }
+    hist.Add(env->NowNanos() - start_nanos);
+
+    if (i % 1000 == 1) {
+      fprintf(stderr, "Wrote %d records...\n", i);
+    }
+
+    // Do the pacing arithmetic in 64 bits: (i + 1) * record_interval can
+    // exceed the range of int, and the elapsed time is a uint64_t.
+    const int64_t due_us =
+        (static_cast<int64_t>(i) + 1) * FLAGS_record_interval;
+    const int64_t elapsed_us =
+        static_cast<int64_t>(env->NowMicros() - start_time);
+    int64_t time_to_sleep = due_us - elapsed_us;
+    if (time_to_sleep > 0) {
+      // The sleep duration is passed as an int, as before; clamp so the
+      // narrowing cannot wrap.
+      const int64_t kMaxSleepMicros = 0x7fffffff;
+      if (time_to_sleep > kMaxSleepMicros) {
+        time_to_sleep = kMaxSleepMicros;
+      }
+      env->SleepForMicroseconds(static_cast<int>(time_to_sleep));
+    }
+  }
+
+  fprintf(stderr, "Distribution of latency of append+flush: \n%s",
+          hist.ToString().c_str());
+}
+} // namespace ROCKSDB_NAMESPACE
+
+// Tool entry point: registers the usage text, parses the command-line flags
+// and runs the benchmark once.
+int main(int argc, char** argv) {
+  std::string usage("\nUSAGE:\n");
+  usage += std::string(argv[0]);
+  usage += " [OPTIONS]...";
+  SetUsageMessage(usage);
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  ROCKSDB_NAMESPACE::RunBenchmark();
+  return 0;
+}
+
+#endif // GFLAGS
diff --git a/src/rocksdb/util/murmurhash.cc b/src/rocksdb/util/murmurhash.cc
new file mode 100644
index 000000000..3b759c5e6
--- /dev/null
+++ b/src/rocksdb/util/murmurhash.cc
@@ -0,0 +1,191 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/*
+ Murmurhash from http://sites.google.com/site/murmurhash/
+
+ All code is released to the public domain. For business purposes, Murmurhash
+ is under the MIT license.
+*/
+#include "murmurhash.h"
+
+#include <string.h>
+
+#include "util/util.h"
+
+#if defined(__x86_64__)
+
+// -------------------------------------------------------------------
+//
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+//
+// 64-bit hash for 64-bit platforms
+
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+// 64-bit MurmurHash2 variant ("MurmurHash64A").
+//
+// key:  start of the bytes to hash; may have any alignment.
+// len:  number of bytes at `key`.
+// seed: mixed into the initial state.
+// Returns the 64-bit hash.
+//
+// The input is consumed eight bytes at a time in native byte order, so the
+// result depends on the endianness of the machine.
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )
+{
+  const uint64_t m = 0xc6a4a7935bd1e995;
+  const int r = 47;
+
+  uint64_t h = seed ^ (len * m);
+
+  const unsigned char * data = (const unsigned char *)key;
+  const unsigned char * end = data + (len/8)*8;
+
+  while(data != end)
+  {
+    // Load through memcpy rather than dereferencing a uint64_t pointer:
+    // `key` carries no alignment guarantee.
+    uint64_t k;
+    memcpy(&k, data, sizeof(k));
+    data += sizeof(k);
+
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h ^= k;
+    h *= m;
+  }
+
+  // Fold in the trailing 1..7 bytes, most significant first.
+  const unsigned char * data2 = data;
+
+  switch(len & 7)
+  {
+  case 7: h ^= ((uint64_t)data2[6]) << 48; FALLTHROUGH_INTENDED;
+  case 6: h ^= ((uint64_t)data2[5]) << 40; FALLTHROUGH_INTENDED;
+  case 5: h ^= ((uint64_t)data2[4]) << 32; FALLTHROUGH_INTENDED;
+  case 4: h ^= ((uint64_t)data2[3]) << 24; FALLTHROUGH_INTENDED;
+  case 3: h ^= ((uint64_t)data2[2]) << 16; FALLTHROUGH_INTENDED;
+  case 2: h ^= ((uint64_t)data2[1]) << 8; FALLTHROUGH_INTENDED;
+  case 1: h ^= ((uint64_t)data2[0]);
+    h *= m;
+  };
+
+  // Final avalanche.
+  h ^= h >> r;
+  h *= m;
+  h ^= h >> r;
+
+  return h;
+}
+
+#elif defined(__i386__)
+
+// -------------------------------------------------------------------
+//
+// Note - This code makes a few assumptions about how your machine behaves -
+//
+// 1. We can read a 4-byte value from any address without crashing
+// 2. sizeof(int) == 4
+//
+// And it has a few limitations -
+//
+// 1. It will not work incrementally.
+// 2. It will not produce the same results on little-endian and big-endian
+// machines.
+
+// 32-bit MurmurHash2.
+//
+// key:  start of the bytes to hash; may have any alignment.
+// len:  number of bytes at `key`.
+// seed: mixed into the initial state.
+// Returns the 32-bit hash.
+//
+// The input is consumed four bytes at a time in native byte order, so the
+// result depends on the endianness of the machine.
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
+{
+  // 'm' and 'r' are mixing constants generated offline.
+  // They're not really 'magic', they just happen to work well.
+
+  const unsigned int m = 0x5bd1e995;
+  const int r = 24;
+
+  // Initialize the hash to a 'random' value
+
+  unsigned int h = seed ^ len;
+
+  // Mix 4 bytes at a time into the hash
+
+  const unsigned char * data = (const unsigned char *)key;
+
+  while(len >= 4)
+  {
+    // Load through memcpy rather than casting `data` to unsigned int*:
+    // the cast dropped const and read through a pointer with no alignment
+    // guarantee.
+    unsigned int k;
+    memcpy(&k, data, sizeof(k));
+
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h *= m;
+    h ^= k;
+
+    data += 4;
+    len -= 4;
+  }
+
+  // Handle the last few bytes of the input array
+
+  switch(len)
+  {
+  case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED;
+  case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED;
+  case 1: h ^= data[0];
+    h *= m;
+  };
+
+  // Do a few final mixes of the hash to ensure the last few
+  // bytes are well-incorporated.
+
+  h ^= h >> 13;
+  h *= m;
+  h ^= h >> 15;
+
+  return h;
+}
+
+#else
+
+// -------------------------------------------------------------------
+//
+// Same as MurmurHash2, but endian- and alignment-neutral.
+// Half the speed though, alas.
+
+// MurmurHash2 variant that assembles each 4-byte word from individual bytes
+// (least significant first), so it needs no particular alignment and does
+// not depend on the machine's byte order.
+//
+// key:  start of the bytes to hash.
+// len:  number of bytes at `key`.
+// seed: mixed into the initial state.
+// Returns the 32-bit hash.
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed )
+{
+  const unsigned int m = 0x5bd1e995;
+  const int r = 24;
+
+  unsigned int h = seed ^ len;
+
+  const unsigned char * data = (const unsigned char *)key;
+
+  while(len >= 4)
+  {
+    unsigned int k;
+
+    // Widen each byte to unsigned int before shifting. An unsigned char
+    // promotes to (signed) int, and shifting a byte >= 0x80 left by 24
+    // would move a one into the sign bit.
+    k = (unsigned int)data[0];
+    k |= (unsigned int)data[1] << 8;
+    k |= (unsigned int)data[2] << 16;
+    k |= (unsigned int)data[3] << 24;
+
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h *= m;
+    h ^= k;
+
+    data += 4;
+    len -= 4;
+  }
+
+  // Fold in the trailing 1..3 bytes.
+  switch(len)
+  {
+  case 3: h ^= (unsigned int)data[2] << 16; FALLTHROUGH_INTENDED;
+  case 2: h ^= (unsigned int)data[1] << 8; FALLTHROUGH_INTENDED;
+  case 1: h ^= (unsigned int)data[0];
+    h *= m;
+  };
+
+  // Final avalanche.
+  h ^= h >> 13;
+  h *= m;
+  h ^= h >> 15;
+
+  return h;
+}
+
+#endif
diff --git a/src/rocksdb/util/murmurhash.h b/src/rocksdb/util/murmurhash.h
new file mode 100644
index 000000000..1dbb57398
--- /dev/null
+++ b/src/rocksdb/util/murmurhash.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/*
+ Murmurhash from http://sites.google.com/site/murmurhash/
+
+ All code is released to the public domain. For business purposes, Murmurhash
+ is under the MIT license.
+*/
+#pragma once
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+#if defined(__x86_64__)
+#define MURMUR_HASH MurmurHash64A
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash64A
+typedef uint64_t murmur_t;
+
+#elif defined(__i386__)
+#define MURMUR_HASH MurmurHash2
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHash2
+typedef unsigned int murmur_t;
+
+#else
+#define MURMUR_HASH MurmurHashNeutral2
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed );
+#define MurmurHash MurmurHashNeutral2
+typedef unsigned int murmur_t;
+#endif
+
+// Allow slice to be hashable by murmur hash.
+namespace ROCKSDB_NAMESPACE {
+struct murmur_hash {
+  // Hashes the slice's bytes with the MurmurHash variant selected above,
+  // using seed 0. The slice size is narrowed to int for the call.
+  size_t operator()(const Slice& slice) const {
+    const int length = static_cast<int>(slice.size());
+    return MurmurHash(slice.data(), length, 0);
+  }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/mutexlock.h b/src/rocksdb/util/mutexlock.h
new file mode 100644
index 000000000..91ba4fda7
--- /dev/null
+++ b/src/rocksdb/util/mutexlock.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <assert.h>
+#include <atomic>
+#include <mutex>
+#include <thread>
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+// void MyClass::MyMethod() {
+// MutexLock l(&mu_); // mu_ is an instance variable
+// ... some complex code, possibly with multiple return paths ...
+// }
+
+class MutexLock {
+ public:
+  // Locks `mu` immediately; the pointer is kept for the destructor.
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) { mu_->Lock(); }
+
+  // Unlocks the mutex taken in the constructor.
+  ~MutexLock() { mu_->Unlock(); }
+
+  // No copying allowed
+  MutexLock(const MutexLock &) = delete;
+  void operator=(const MutexLock &) = delete;
+
+ private:
+  port::Mutex *const mu_;
+};
+
+//
+// Acquires a read lock on the specified RWMutex on construction.
+// The lock is released automatically when the object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) { mu_->ReadLock(); }
+
+  ~ReadLock() { mu_->ReadUnlock(); }
+
+  // No copying allowed
+  ReadLock(const ReadLock &) = delete;
+  void operator=(const ReadLock &) = delete;
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// Adopts an RWMutex that the caller already holds (checked with
+// AssertHeld()) and calls ReadUnlock() on it when the object is destroyed.
+//
+class ReadUnlock {
+ public:
+  explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu_->AssertHeld(); }
+
+  ~ReadUnlock() { mu_->ReadUnlock(); }
+
+  // No copying allowed
+  ReadUnlock(const ReadUnlock &) = delete;
+  ReadUnlock &operator=(const ReadUnlock &) = delete;
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// Acquires a write lock on the specified RWMutex on construction.
+// The lock is released automatically when the object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) { mu_->WriteLock(); }
+
+  ~WriteLock() { mu_->WriteUnlock(); }
+
+  // No copying allowed
+  WriteLock(const WriteLock &) = delete;
+  void operator=(const WriteLock &) = delete;
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// SpinMutex has very low overhead for low-contention cases. Method names
+// are chosen so you can use std::unique_lock or std::lock_guard with it.
+//
+class SpinMutex {
+ public:
+  SpinMutex() : locked_(false) {}
+
+  // Makes one attempt to take the lock; returns true on success. The flag
+  // is read first with a relaxed load so that an already-held lock fails
+  // without attempting the exchange.
+  bool try_lock() {
+    bool observed = locked_.load(std::memory_order_relaxed);
+    if (observed) {
+      return false;
+    }
+    return locked_.compare_exchange_weak(observed, true,
+                                         std::memory_order_acquire,
+                                         std::memory_order_relaxed);
+  }
+
+  // Retries try_lock() until it succeeds, pausing after each failure and
+  // additionally yielding the thread once more than 100 attempts failed.
+  void lock() {
+    size_t failures = 0;
+    while (!try_lock()) {
+      port::AsmVolatilePause();
+      if (failures > 100) {
+        std::this_thread::yield();
+      }
+      ++failures;
+    }
+  }
+
+  // Releases the lock with a release store.
+  void unlock() { locked_.store(false, std::memory_order_release); }
+
+ private:
+  std::atomic<bool> locked_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/ppc-opcode.h b/src/rocksdb/util/ppc-opcode.h
new file mode 100644
index 000000000..5cc5af0e3
--- /dev/null
+++ b/src/rocksdb/util/ppc-opcode.h
@@ -0,0 +1,27 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// Field helpers: each masks a register number and shifts it into position
+// within a 32-bit instruction word.
+// RA / RB: the low five bits of the number, placed at bit 16 / bit 11.
+#define __PPC_RA(a) (((a)&0x1f) << 16)
+#define __PPC_RB(b) (((b)&0x1f) << 11)
+// XA / XB / XS: six-bit register numbers. The low five bits go to bit
+// 16 / 11 / 21 and the sixth bit (0x20) is moved down to bit 2 / 1 / 0.
+#define __PPC_XA(a) ((((a)&0x1f) << 16) | (((a)&0x20) >> 3))
+#define __PPC_XB(b) ((((b)&0x1f) << 11) | (((b)&0x20) >> 4))
+#define __PPC_XS(s) ((((s)&0x1f) << 21) | (((s)&0x20) >> 5))
+// XT uses the same encoding as XS.
+#define __PPC_XT(s) __PPC_XS(s)
+// Operand bits for a three-register form (XT, XA, XB) and for a form that
+// combines XS with the plain RA and RB fields.
+#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
+#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+
+// Base opcode words; the operand fields above are OR-ed into them.
+#define PPC_INST_VPMSUMW 0x10000488
+#define PPC_INST_VPMSUMD 0x100004c8
+#define PPC_INST_MFVSRD 0x7c000066
+#define PPC_INST_MTVSRD 0x7c000166
+
+// Each of these expands to a `.long` directive holding the encoded
+// instruction word, so they are meant for use in assembly sources.
+// NOTE(review): the "+ 32" in MFVRD/MTVRD presumably maps a vector register
+// number into the upper half of the 64-entry VSX register numbering --
+// confirm against the Power ISA.
+#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b)
+#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b)
+#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t) + 32, a, 0)
+#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t) + 32, a, 0)
diff --git a/src/rocksdb/util/random.cc b/src/rocksdb/util/random.cc
new file mode 100644
index 000000000..38c36defd
--- /dev/null
+++ b/src/rocksdb/util/random.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "util/random.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <thread>
+#include <utility>
+
+#include "port/likely.h"
+#include "util/thread_local.h"
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+#define STORAGE_DECL static __thread
+#else
+#define STORAGE_DECL static
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Returns the Random instance held in STORAGE_DECL storage, constructing it
+// on first use in a pre-reserved buffer (placement new, no heap allocation)
+// and seeding it from the hash of the calling thread's id.
+Random* Random::GetTLSInstance() {
+  STORAGE_DECL Random* tls_instance;
+  STORAGE_DECL std::aligned_storage<sizeof(Random)>::type tls_instance_bytes;
+
+  if (UNLIKELY(tls_instance == nullptr)) {
+    const std::thread::id self = std::this_thread::get_id();
+    const size_t seed = std::hash<std::thread::id>()(self);
+    tls_instance = new (&tls_instance_bytes) Random((uint32_t)seed);
+  }
+  return tls_instance;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/random.h b/src/rocksdb/util/random.h
new file mode 100644
index 000000000..f97b2126f
--- /dev/null
+++ b/src/rocksdb/util/random.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <random>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A very simple random number generator. Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+//
+// The state is a single 32-bit value advanced as seed = (seed * A) % M with
+// A = 16807 and M = 2^31 - 1.
+class Random {
+ private:
+  enum : uint32_t {
+    M = 2147483647L  // 2^31-1
+  };
+  enum : uint64_t {
+    A = 16807  // bits 14, 8, 7, 5, 2, 1, 0
+  };
+
+  uint32_t seed_;
+
+  // Maps an arbitrary 32-bit value to a usable state in [1, M-1]. The two
+  // fixed points of the recurrence, 0 and M, are replaced by 1: from either
+  // of them Next() would return the same value forever. (Previously only 0
+  // was remapped, so seeds 0x7fffffff and 0xffffffff got stuck at M.)
+  static uint32_t GoodSeed(uint32_t s) {
+    const uint32_t masked = s & M;
+    return (masked == 0 || masked == M) ? 1 : masked;
+  }
+
+ public:
+  // This is the largest value that can be returned from Next()
+  enum : uint32_t { kMaxNext = M };
+
+  explicit Random(uint32_t s) : seed_(GoodSeed(s)) {}
+
+  // Restarts the sequence from seed `s` (sanitized the same way as in the
+  // constructor).
+  void Reset(uint32_t s) { seed_ = GoodSeed(s); }
+
+  // Advances the generator and returns the new state.
+  uint32_t Next() {
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively.  For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = static_cast<uint32_t>((product >> 31) + (product & M));
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat.  mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return Uniform(n) == 0; }
+
+  // "Optional" one-in-n, where 0 or negative always returns false
+  // (may or may not consume a random value)
+  bool OneInOpt(int n) { return n > 0 && OneIn(n); }
+
+  // Returns random bool that is true for the given percentage of
+  // calls on average. Zero or less is always false and 100 or more
+  // is always true (may or may not consume a random value)
+  bool PercentTrue(int percentage) {
+    return static_cast<int>(Uniform(100)) < percentage;
+  }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+
+  // Returns a Random instance for use by the current thread without
+  // additional locking
+  static Random* GetTLSInstance();
+};
+
+// A good 32-bit random number generator based on std::mt19937.
+// This exists in part to avoid compiler variance in warning about coercing
+// uint_fast32_t from mt19937 to uint32_t.
+class Random32 {
+ private:
+  std::mt19937 generator_;
+
+ public:
+  explicit Random32(uint32_t s) : generator_(s) {}
+
+  // Generates the next random number
+  uint32_t Next() {
+    const std::mt19937::result_type raw = generator_();
+    return static_cast<uint32_t>(raw);
+  }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(uint32_t n) {
+    typedef std::uniform_int_distribution<std::mt19937::result_type> Dist;
+    Dist pick(0, n - 1);
+    return static_cast<uint32_t>(pick(generator_));
+  }
+
+  // Returns an *almost* uniformly distributed value in the range [0..n-1].
+  // Much faster than Uniform().
+  // REQUIRES: n > 0
+  uint32_t Uniformish(uint32_t n) {
+    // fastrange (without the header): scale the output by n and keep the
+    // high 32 bits of the 64-bit product.
+    const uint64_t scaled = uint64_t(generator_()) * uint64_t(n);
+    return static_cast<uint32_t>(scaled >> 32);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint32_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    const uint32_t base = Uniform(max_log + 1);
+    return Uniform(uint32_t{1} << base);
+  }
+
+  // Reset the seed of the generator to the given value
+  void Seed(uint32_t new_seed) { generator_.seed(new_seed); }
+};
+
+// A good 64-bit random number generator based on std::mt19937_64
+class Random64 {
+ private:
+  std::mt19937_64 generator_;
+
+ public:
+  explicit Random64(uint64_t s) : generator_(s) {}
+
+  // Generates the next random number
+  uint64_t Next() { return generator_(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint64_t Uniform(uint64_t n) {
+    std::uniform_int_distribution<uint64_t> pick(0, n - 1);
+    return pick(generator_);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint64_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint64_t Skewed(int max_log) {
+    const uint64_t base = Uniform(max_log + 1);
+    return Uniform(uint64_t(1) << base);
+  }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/random_test.cc b/src/rocksdb/util/random_test.cc
new file mode 100644
index 000000000..ad5208193
--- /dev/null
+++ b/src/rocksdb/util/random_test.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+#include <vector>
+
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using ROCKSDB_NAMESPACE::Random;
+
+// For several seeds and ranges, draws range * average values from
+// Uniform(range) and expects every bucket count to stay within a tolerance
+// of `average`.
+TEST(RandomTest, Uniform) {
+  const int average = 20;
+  for (uint32_t seed : {0, 1, 2, 37, 4096}) {
+    Random r(seed);
+    for (int range : {1, 2, 8, 12, 100}) {
+      std::vector<int> counts(range, 0);
+
+      const int draws = range * average;
+      for (int n = 0; n < draws; ++n) {
+        const uint32_t bucket = r.Uniform(range);
+        // at() also verifies that the value is inside [0, range).
+        ++counts.at(bucket);
+      }
+
+      const int max_variance = static_cast<int>(std::sqrt(range) * 2 + 4);
+      const int lowest = std::max(1, average - max_variance);
+      const int highest = average + max_variance + 1;
+      for (int bucket = 0; bucket < range; ++bucket) {
+        EXPECT_GE(counts[bucket], lowest);
+        EXPECT_LE(counts[bucket], highest);
+      }
+    }
+  }
+}
+
+// Calls OneIn(range) average * range times and expects about `average`
+// hits; exactly `average` when range is 1.
+TEST(RandomTest, OneIn) {
+  Random r(42);
+  for (int range : {1, 2, 8, 12, 100, 1234}) {
+    const int average = 100;
+    const int trials = average * range;
+    int hits = 0;
+    for (int t = 0; t < trials; ++t) {
+      hits += r.OneIn(range) ? 1 : 0;
+    }
+    if (range == 1) {
+      EXPECT_EQ(hits, average);
+      continue;
+    }
+    const int max_variance = static_cast<int>(std::sqrt(average) * 1.5);
+    EXPECT_GE(hits, average - max_variance);
+    EXPECT_LE(hits, average + max_variance);
+  }
+}
+
+// Like the OneIn test, but also covers ranges <= 0, for which OneInOpt must
+// always return false.
+TEST(RandomTest, OneInOpt) {
+  Random r(42);
+  for (int range : {-12, 0, 1, 2, 8, 12, 100, 1234}) {
+    const int average = 100;
+    // Sample at least `average` times. With the plain product
+    // average * range the loop body never ran for range <= 0, so the
+    // "always false" expectation below was never actually exercised.
+    const int iterations = average * std::max(range, 1);
+    int count = 0;
+    for (int i = 0; i < iterations; ++i) {
+      if (r.OneInOpt(range)) {
+        ++count;
+      }
+    }
+    if (range < 1) {
+      EXPECT_EQ(count, 0);
+    } else if (range == 1) {
+      EXPECT_EQ(count, average);
+    } else {
+      int max_variance = static_cast<int>(std::sqrt(average) * 1.5);
+      EXPECT_GE(count, average - max_variance);
+      EXPECT_LE(count, average + max_variance);
+    }
+  }
+}
+
+// Samples PercentTrue(pct) 10000 times per percentage. Expects no hits for
+// pct <= 0, all hits for pct >= 100, and otherwise a hit rate that rounds to
+// pct percent.
+TEST(RandomTest, PercentTrue) {
+  Random r(42);
+  for (int pct : {-12, 0, 1, 2, 10, 50, 90, 98, 99, 100, 1234}) {
+    const int samples = 10000;
+
+    int hits = 0;
+    for (int s = 0; s < samples; ++s) {
+      hits += r.PercentTrue(pct) ? 1 : 0;
+    }
+
+    if (pct <= 0) {
+      EXPECT_EQ(hits, 0);
+    } else if (pct >= 100) {
+      EXPECT_EQ(hits, samples);
+    } else {
+      // Observed rate in percent, rounded to the nearest integer.
+      const int est = (hits * 100 + (samples / 2)) / samples;
+      EXPECT_EQ(est, pct);
+    }
+  }
+}
+
+// Test entry point: initializes gtest and runs every registered test.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
diff --git a/src/rocksdb/util/rate_limiter.cc b/src/rocksdb/util/rate_limiter.cc
new file mode 100644
index 000000000..b1eefe620
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter.cc
@@ -0,0 +1,339 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/rate_limiter.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Requests quota for up to `bytes` bytes and returns the number of bytes the
+// caller is allowed to process. When the operation is not rate limited (or
+// `io_priority` is not below Env::IO_TOTAL) `bytes` is returned unchanged and
+// no quota is requested.
+size_t RateLimiter::RequestToken(size_t bytes, size_t alignment,
+                                 Env::IOPriority io_priority, Statistics* stats,
+                                 RateLimiter::OpType op_type) {
+  if (io_priority >= Env::IO_TOTAL || !IsRateLimited(op_type)) {
+    return bytes;
+  }
+
+  // Never ask for more than one burst at a time.
+  const size_t single_burst = static_cast<size_t>(GetSingleBurstBytes());
+  bytes = std::min(bytes, single_burst);
+
+  if (alignment > 0) {
+    // Here we may actually require more than burst and block
+    // but we can not write less than one page at a time on direct I/O
+    // thus we may want not to use ratelimiter
+    const size_t truncated = TruncateToPageBoundary(alignment, bytes);
+    bytes = std::max(alignment, truncated);
+  }
+
+  Request(bytes, io_priority, stats, op_type);
+  return bytes;
+}
+
+// Pending request: one instance per thread blocked in
+// GenericRateLimiter::Request(). Request() creates it on the caller's stack
+// and enqueues a pointer to it.
+struct GenericRateLimiter::Req {
+ // `_bytes` is the size of the request; `_mu` is the mutex the condition
+ // variable is bound to (Request() passes request_mutex_).
+ explicit Req(int64_t _bytes, port::Mutex* _mu)
+ : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {}
+ // Bytes still to be granted; Refill() decrements this on a partial grant and
+ // sets it to zero once the request is fully satisfied.
+ int64_t request_bytes;
+ // Original request size; Refill() adds it to the per-priority byte totals
+ // when the request is granted.
+ int64_t bytes;
+ // Signalled to wake the waiting thread.
+ port::CondVar cv;
+ // Set to true by Refill() once the full quota has been granted.
+ bool granted;
+};
+
+// Constructs a limiter with the given rate, refill period and fairness.
+//
+// - When `auto_tuned` is true the initial rate is half of
+//   `rate_bytes_per_sec`; the full value is kept in max_bytes_per_sec_, which
+//   Tune() uses as the upper bound.
+// - `fairness` values above 100 are clamped to 100.
+// - The budget starts empty (available_bytes_ == 0) and the first refill is
+//   due immediately (next_refill_us_ is the current monotonic time).
+GenericRateLimiter::GenericRateLimiter(int64_t rate_bytes_per_sec,
+ int64_t refill_period_us,
+ int32_t fairness, RateLimiter::Mode mode,
+ Env* env, bool auto_tuned)
+ : RateLimiter(mode),
+ refill_period_us_(refill_period_us),
+ rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2
+ : rate_bytes_per_sec),
+ // Depends on refill_period_us_ and rate_bytes_per_sec_ initialized above.
+ refill_bytes_per_period_(
+ CalculateRefillBytesPerPeriod(rate_bytes_per_sec_)),
+ env_(env),
+ stop_(false),
+ exit_cv_(&request_mutex_),
+ requests_to_wait_(0),
+ available_bytes_(0),
+ next_refill_us_(NowMicrosMonotonic(env_)),
+ fairness_(fairness > 100 ? 100 : fairness),
+ // Seeded from wall-clock time.
+ rnd_((uint32_t)time(nullptr)),
+ leader_(nullptr),
+ auto_tuned_(auto_tuned),
+ num_drains_(0),
+ prev_num_drains_(0),
+ max_bytes_per_sec_(rate_bytes_per_sec),
+ tuned_time_(NowMicrosMonotonic(env_)) {
+ // Per-priority counters (index 0 and 1) start at zero.
+ total_requests_[0] = 0;
+ total_requests_[1] = 0;
+ total_bytes_through_[0] = 0;
+ total_bytes_through_[1] = 0;
+}
+
+GenericRateLimiter::~GenericRateLimiter() {
+  // Shutdown handshake with threads blocked in Request(): set stop_, record
+  // how many queued requests exist, wake all of them, and wait until each one
+  // has decremented requests_to_wait_ on its way out.
+  MutexLock g(&request_mutex_);
+  stop_ = true;
+
+  auto& high_pri_queue = queue_[Env::IO_HIGH];
+  auto& low_pri_queue = queue_[Env::IO_LOW];
+  requests_to_wait_ =
+      static_cast<int32_t>(low_pri_queue.size() + high_pri_queue.size());
+
+  // Wake high-priority waiters first, then low-priority ones.
+  for (auto& waiter : high_pri_queue) {
+    waiter->cv.Signal();
+  }
+  for (auto& waiter : low_pri_queue) {
+    waiter->cv.Signal();
+  }
+
+  // Each woken request signals exit_cv_ after decrementing the counter.
+  while (requests_to_wait_ > 0) {
+    exit_cv_.Wait();
+  }
+}
+
+// Dynamically changes the rate limit. Updates both the stored rate and the
+// derived per-period refill amount; `bytes_per_second` must be positive.
+void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) {
+  assert(bytes_per_second > 0);
+  rate_bytes_per_sec_ = bytes_per_second;
+  const int64_t refill_bytes = CalculateRefillBytesPerPeriod(bytes_per_second);
+  refill_bytes_per_period_.store(refill_bytes, std::memory_order_relaxed);
+}
+
+// Blocks the calling thread until `bytes` of quota have been granted at
+// priority `pri`, or until the limiter is being destroyed (stop_ is set).
+//
+// Fast path: if enough bytes are available they are deducted and the call
+// returns at once. Otherwise a stack-allocated Req is appended to
+// queue_[pri] and the thread waits on its condition variable. One waiter at
+// a time acts as "leader": it sleeps until the next refill time and then
+// runs Refill(), which grants quota and wakes the granted requests.
+//
+// `bytes` must not exceed the current per-period refill amount (asserted).
+// `stats` is only used to record NUMBER_RATE_LIMITER_DRAINS ticks.
+void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
+ Statistics* stats) {
+ assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+ TEST_SYNC_POINT("GenericRateLimiter::Request");
+ TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1",
+ &rate_bytes_per_sec_);
+ MutexLock g(&request_mutex_);
+
+ if (auto_tuned_) {
+ // Re-tune once at least kRefillsPerTune refill periods have elapsed since
+ // the last tuning.
+ static const int kRefillsPerTune = 100;
+ std::chrono::microseconds now(NowMicrosMonotonic(env_));
+ if (now - tuned_time_ >=
+ kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) {
+ Tune();
+ }
+ }
+
+ if (stop_) {
+ return;
+ }
+
+ ++total_requests_[pri];
+
+ if (available_bytes_ >= bytes) {
+ // Refill thread assigns quota and notifies requests waiting on
+ // the queue under mutex. So if we get here, that means nobody
+ // is waiting?
+ available_bytes_ -= bytes;
+ total_bytes_through_[pri] += bytes;
+ return;
+ }
+
+ // Request cannot be satisfied at this moment, enqueue
+ Req r(bytes, &request_mutex_);
+ queue_[pri].push_back(&r);
+
+ do {
+ bool timedout = false;
+ // Leader election, candidates can be:
+ // (1) a new incoming request,
+ // (2) a previous leader, whose quota has not been assigned yet due
+ // to lower priority
+ // (3) a previous waiter at the front of queue, who got notified by
+ // previous leader
+ if (leader_ == nullptr &&
+ ((!queue_[Env::IO_HIGH].empty() &&
+ &r == queue_[Env::IO_HIGH].front()) ||
+ (!queue_[Env::IO_LOW].empty() &&
+ &r == queue_[Env::IO_LOW].front()))) {
+ leader_ = &r;
+ // Time remaining until the next scheduled refill, clamped at zero.
+ int64_t delta = next_refill_us_ - NowMicrosMonotonic(env_);
+ delta = delta > 0 ? delta : 0;
+ if (delta == 0) {
+ // Refill is already due: skip the wait and behave as if it timed out.
+ timedout = true;
+ } else {
+ // The deadline is expressed on the env_->NowMicros() clock, whereas
+ // `delta` was computed from the monotonic clock.
+ int64_t wait_until = env_->NowMicros() + delta;
+ RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS);
+ ++num_drains_;
+ timedout = r.cv.TimedWait(wait_until);
+ }
+ } else {
+ // Not at the front of queue or a leader has already been elected
+ r.cv.Wait();
+ }
+
+ // request_mutex_ is held from now on
+ if (stop_) {
+ // The destructor is waiting for every queued request to leave.
+ --requests_to_wait_;
+ exit_cv_.Signal();
+ return;
+ }
+
+ // Make sure the waken up request is always the header of its queue
+ assert(r.granted ||
+ (!queue_[Env::IO_HIGH].empty() &&
+ &r == queue_[Env::IO_HIGH].front()) ||
+ (!queue_[Env::IO_LOW].empty() &&
+ &r == queue_[Env::IO_LOW].front()));
+ assert(leader_ == nullptr ||
+ (!queue_[Env::IO_HIGH].empty() &&
+ leader_ == queue_[Env::IO_HIGH].front()) ||
+ (!queue_[Env::IO_LOW].empty() &&
+ leader_ == queue_[Env::IO_LOW].front()));
+
+ if (leader_ == &r) {
+ // Waken up from TimedWait()
+ if (timedout) {
+ // Time to do refill!
+ Refill();
+
+ // Re-elect a new leader regardless. This is to simplify the
+ // election handling.
+ leader_ = nullptr;
+
+ // Notify the header of queue if current leader is going away
+ if (r.granted) {
+ // Current leader already got granted with quota. Notify header
+ // of waiting queue to participate next round of election.
+ assert((queue_[Env::IO_HIGH].empty() ||
+ &r != queue_[Env::IO_HIGH].front()) &&
+ (queue_[Env::IO_LOW].empty() ||
+ &r != queue_[Env::IO_LOW].front()));
+ if (!queue_[Env::IO_HIGH].empty()) {
+ queue_[Env::IO_HIGH].front()->cv.Signal();
+ } else if (!queue_[Env::IO_LOW].empty()) {
+ queue_[Env::IO_LOW].front()->cv.Signal();
+ }
+ // Done
+ break;
+ }
+ } else {
+ // Spontaneous wake up, need to continue to wait
+ assert(!r.granted);
+ leader_ = nullptr;
+ }
+ } else {
+ // Waken up by previous leader:
+ // (1) if requested quota is granted, it is done.
+ // (2) if requested quota is not granted, this means current thread
+ // was picked as a new leader candidate (previous leader got quota).
+ // It needs to participate leader election because a new request may
+ // come in before this thread gets waken up. So it may actually need
+ // to do Wait() again.
+ assert(!timedout);
+ }
+ } while (!r.granted);
+}
+
+// Schedules the next refill, tops up the byte budget and hands quota to
+// queued requests. Called from Request() by the current leader.
+void GenericRateLimiter::Refill() {
+  TEST_SYNC_POINT("GenericRateLimiter::Refill");
+  next_refill_us_ = NowMicrosMonotonic(env_) + refill_period_us_;
+
+  // Left-over quota is carried over; a fresh period's worth is added only
+  // while less than one period's worth remains.
+  const auto period_quota =
+      refill_bytes_per_period_.load(std::memory_order_relaxed);
+  if (available_bytes_ < period_quota) {
+    available_bytes_ += period_quota;
+  }
+
+  // The low-priority queue is served first when OneIn(fairness_) returns
+  // true; otherwise the high-priority queue goes first.
+  const bool low_pri_first = rnd_.OneIn(fairness_);
+  const Env::IOPriority serve_order[2] = {
+      low_pri_first ? Env::IO_LOW : Env::IO_HIGH,
+      low_pri_first ? Env::IO_HIGH : Env::IO_LOW};
+
+  for (const Env::IOPriority pri : serve_order) {
+    auto& pending = queue_[pri];
+    while (!pending.empty()) {
+      auto* head = pending.front();
+      if (available_bytes_ < head->request_bytes) {
+        // avoid starvation: give the head of the queue whatever is left so
+        // that its outstanding amount shrinks every period.
+        head->request_bytes -= available_bytes_;
+        available_bytes_ = 0;
+        break;
+      }
+
+      // Fully satisfy the request and remove it from the queue.
+      available_bytes_ -= head->request_bytes;
+      head->request_bytes = 0;
+      total_bytes_through_[pri] += head->bytes;
+      pending.pop_front();
+      head->granted = true;
+
+      // The leader is the thread running this function, so it needs no
+      // wake-up; every other granted request is signalled.
+      if (head != leader_) {
+        head->cv.Signal();
+      }
+    }
+  }
+}
+
+// Converts a rate in bytes per second into the number of bytes to add per
+// refill period (refill_period_us_), never returning less than
+// kMinRefillBytesPerPeriod.
+int64_t GenericRateLimiter::CalculateRefillBytesPerPeriod(
+    int64_t rate_bytes_per_sec) {
+  // rate * period would exceed kMaxInt64 when period > kMaxInt64 / rate.
+  const bool product_overflows =
+      port::kMaxInt64 / rate_bytes_per_sec < refill_period_us_;
+  if (product_overflows) {
+    // Avoid unexpected result in the overflow case. The result now is still
+    // inaccurate but is a number that is large enough.
+    return port::kMaxInt64 / 1000000;
+  }
+
+  const int64_t bytes_per_period =
+      rate_bytes_per_sec * refill_period_us_ / 1000000;
+  return std::max(kMinRefillBytesPerPeriod, bytes_per_period);
+}
+
+// Adjusts the rate limit based on how often the limiter was drained since the
+// previous tuning. The percentage of refill intervals that saw a drain is
+// compared against a low and a high watermark: below the low one the rate is
+// reduced, above the high one it is increased, in between it is left alone.
+// Always returns Status::OK().
+Status GenericRateLimiter::Tune() {
+  const int kLowWatermarkPct = 50;
+  const int kHighWatermarkPct = 90;
+  const int kAdjustFactorPct = 5;
+  // computed rate limit will be in
+  // `[max_bytes_per_sec_ / kAllowedRangeFactor, max_bytes_per_sec_]`.
+  const int kAllowedRangeFactor = 20;
+
+  const std::chrono::microseconds refill_period(refill_period_us_);
+  const std::chrono::microseconds prev_tuned_time = tuned_time_;
+  tuned_time_ = std::chrono::microseconds(NowMicrosMonotonic(env_));
+
+  // Number of refill periods since the last tuning, rounded up.
+  const int64_t elapsed_intervals =
+      (tuned_time_ - prev_tuned_time + refill_period -
+       std::chrono::microseconds(1)) /
+      refill_period;
+  // We tune every kRefillsPerTune intervals, so the overflow and division-by-
+  // zero conditions should never happen.
+  assert(num_drains_ - prev_num_drains_ <= port::kMaxInt64 / 100);
+  assert(elapsed_intervals > 0);
+  const int64_t drained_pct =
+      (num_drains_ - prev_num_drains_) * 100 / elapsed_intervals;
+
+  const int64_t prev_bytes_per_sec = GetBytesPerSecond();
+  const int64_t min_bytes_per_sec = max_bytes_per_sec_ / kAllowedRangeFactor;
+
+  // Default: between the watermarks the rate stays where it is.
+  int64_t new_bytes_per_sec = prev_bytes_per_sec;
+  if (drained_pct == 0) {
+    // Never drained: drop straight to the lower bound.
+    new_bytes_per_sec = min_bytes_per_sec;
+  } else if (drained_pct < kLowWatermarkPct) {
+    // Rarely drained: scale down, but not below the lower bound.
+    // Clamp the operand first so the multiplication cannot overflow.
+    const int64_t capped_prev =
+        std::min(prev_bytes_per_sec, port::kMaxInt64 / 100);
+    new_bytes_per_sec = std::max(
+        min_bytes_per_sec, capped_prev * 100 / (100 + kAdjustFactorPct));
+  } else if (drained_pct > kHighWatermarkPct) {
+    // Almost always drained: scale up, but not above the configured maximum.
+    // Clamp the operand first so the multiplication cannot overflow.
+    const int64_t capped_prev = std::min(
+        prev_bytes_per_sec, port::kMaxInt64 / (100 + kAdjustFactorPct));
+    new_bytes_per_sec = std::min(
+        max_bytes_per_sec_, capped_prev * (100 + kAdjustFactorPct) / 100);
+  }
+
+  if (new_bytes_per_sec != prev_bytes_per_sec) {
+    SetBytesPerSecond(new_bytes_per_sec);
+  }
+  // Rewind the drain counter to the baseline value. prev_num_drains_ is not
+  // modified by this function, so the next tuning window starts counting from
+  // that same baseline.
+  num_drains_ = prev_num_drains_;
+  return Status::OK();
+}
+
+// Factory for a GenericRateLimiter bound to Env::Default(). The returned
+// object is allocated with `new`. `rate_bytes_per_sec`, `refill_period_us`
+// and `fairness` must all be positive (asserted).
+RateLimiter* NewGenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */,
+    int32_t fairness /* = 10 */,
+    RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */,
+    bool auto_tuned /* = false */) {
+  assert(rate_bytes_per_sec > 0);
+  assert(refill_period_us > 0);
+  assert(fairness > 0);
+
+  RateLimiter* limiter =
+      new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness,
+                             mode, Env::Default(), auto_tuned);
+  return limiter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/rate_limiter.h b/src/rocksdb/util/rate_limiter.h
new file mode 100644
index 000000000..5f047a567
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <deque>
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rate_limiter.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Rate limiter that grants a fixed byte budget once per refill period and
+// queues requests, per I/O priority, that cannot be satisfied immediately.
+class GenericRateLimiter : public RateLimiter {
+ public:
+  // `rate_bytes_per_sec` is the target rate; the per-period refill amount is
+  // derived from it and `refill_period_us`. (The parameter name matches the
+  // constructor definition.)
+  GenericRateLimiter(int64_t rate_bytes_per_sec, int64_t refill_period_us,
+                     int32_t fairness, RateLimiter::Mode mode, Env* env,
+                     bool auto_tuned);
+
+  virtual ~GenericRateLimiter();
+
+  // This API allows user to dynamically change rate limiter's bytes per second.
+  virtual void SetBytesPerSecond(int64_t bytes_per_second) override;
+
+  // Request for token to write bytes. If this request can not be satisfied,
+  // the call is blocked. Caller is responsible to make sure
+  // bytes <= GetSingleBurstBytes()
+  using RateLimiter::Request;
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* stats) override;
+
+  // Largest request size accepted by Request(): the current per-period refill
+  // amount.
+  virtual int64_t GetSingleBurstBytes() const override {
+    return refill_bytes_per_period_.load(std::memory_order_relaxed);
+  }
+
+  // Bytes granted so far for `pri`, or for both priorities combined when
+  // `pri` is Env::IO_TOTAL. Takes request_mutex_.
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_bytes_through_[Env::IO_LOW] +
+             total_bytes_through_[Env::IO_HIGH];
+    }
+    return total_bytes_through_[pri];
+  }
+
+  // Number of requests seen so far for `pri`, or for both priorities combined
+  // when `pri` is Env::IO_TOTAL. Takes request_mutex_.
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_requests_[Env::IO_LOW] + total_requests_[Env::IO_HIGH];
+    }
+    return total_requests_[pri];
+  }
+
+  // Current rate. Read without taking request_mutex_.
+  virtual int64_t GetBytesPerSecond() const override {
+    return rate_bytes_per_sec_;
+  }
+
+ private:
+  void Refill();
+  int64_t CalculateRefillBytesPerPeriod(int64_t rate_bytes_per_sec);
+  Status Tune();
+
+  // Current time in microseconds, derived from Env::NowNanos()
+  // (std::milli::den == 1000 nanoseconds per microsecond).
+  uint64_t NowMicrosMonotonic(Env* env) {
+    return env->NowNanos() / std::milli::den;
+  }
+
+  // NOTE: the declaration order of the data members below is the order in
+  // which the constructor initializes them; some initializers read members
+  // declared earlier.
+
+  // This mutex guard all internal states
+  mutable port::Mutex request_mutex_;
+
+  // Lower bound for the per-period refill amount.
+  const int64_t kMinRefillBytesPerPeriod = 100;
+
+  const int64_t refill_period_us_;
+
+  int64_t rate_bytes_per_sec_;
+  // This variable can be changed dynamically.
+  std::atomic<int64_t> refill_bytes_per_period_;
+  Env* const env_;
+
+  // Shutdown handshake between the destructor and blocked requests.
+  bool stop_;
+  port::CondVar exit_cv_;
+  int32_t requests_to_wait_;
+
+  // Per-priority statistics, indexed by Env::IOPriority.
+  int64_t total_requests_[Env::IO_TOTAL];
+  int64_t total_bytes_through_[Env::IO_TOTAL];
+  // Bytes that can currently be granted without waiting.
+  int64_t available_bytes_;
+  // Time (NowMicrosMonotonic) at which the next refill is due.
+  int64_t next_refill_us_;
+
+  // Refill() serves the low-priority queue first when rnd_.OneIn(fairness_)
+  // returns true.
+  int32_t fairness_;
+  Random rnd_;
+
+  struct Req;
+  // Request currently responsible for waiting for and performing the refill.
+  Req* leader_;
+  // Pending requests, one FIFO queue per priority.
+  std::deque<Req*> queue_[Env::IO_TOTAL];
+
+  // State used by Tune() when auto-tuning is enabled.
+  bool auto_tuned_;
+  int64_t num_drains_;
+  int64_t prev_num_drains_;
+  const int64_t max_bytes_per_sec_;
+  std::chrono::microseconds tuned_time_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/rate_limiter_test.cc b/src/rocksdb/util/rate_limiter_test.cc
new file mode 100644
index 000000000..6ca91e9e2
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter_test.cc
@@ -0,0 +1,235 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/rate_limiter.h"
+
+#include <chrono>
+#include <cinttypes>
+#include <limits>
+
+#include "db/db_test_util.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test fixture for the rate limiter tests below; it adds no state or setup.
+// TODO(yhchiang): the rate will not be accurate when we run test in parallel.
+class RateLimiterTest : public testing::Test {};
+
+TEST_F(RateLimiterTest, OverflowRate) {
+  // With the largest representable rate, the per-period burst must still come
+  // out as a large positive number.
+  const int64_t refill_period_us = 1000;
+  const int32_t fairness = 10;
+  GenericRateLimiter limiter(port::kMaxInt64, refill_period_us, fairness,
+                             RateLimiter::Mode::kWritesOnly, Env::Default(),
+                             false /* auto_tuned */);
+  ASSERT_GT(limiter.GetSingleBurstBytes(), 1000000000ll);
+}
+
+TEST_F(RateLimiterTest, StartStop) {
+  // Construct a limiter and let it be destroyed right away with no requests.
+  std::unique_ptr<RateLimiter> limiter;
+  limiter.reset(NewGenericRateLimiter(100, 100, 10));
+}
+
+TEST_F(RateLimiterTest, Modes) {
+  // For each mode, issue one read and then one write of 1000 bytes and check
+  // which of them were charged against the limiter.
+  const RateLimiter::Mode modes[] = {RateLimiter::Mode::kWritesOnly,
+                                     RateLimiter::Mode::kReadsOnly,
+                                     RateLimiter::Mode::kAllIo};
+  for (const RateLimiter::Mode mode : modes) {
+    GenericRateLimiter limiter(
+        2000 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */,
+        10 /* fairness */, mode, Env::Default(), false /* auto_tuned */);
+
+    // The read is charged unless only writes are limited.
+    limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */,
+                    RateLimiter::OpType::kRead);
+    const bool read_charged = (mode != RateLimiter::Mode::kWritesOnly);
+    if (read_charged) {
+      ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    } else {
+      ASSERT_EQ(0, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    }
+
+    // After the write, the total is 2000 only when both kinds are limited;
+    // in the single-kind modes exactly one of the two requests was charged.
+    limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */,
+                    RateLimiter::OpType::kWrite);
+    const bool both_charged = (mode == RateLimiter::Mode::kAllIo);
+    if (both_charged) {
+      ASSERT_EQ(2000, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    } else {
+      ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    }
+  }
+}
+
+// Compiled out when both TRAVIS and OS_MACOSX are defined.
+#if !(defined(TRAVIS) && defined(OS_MACOSX))
+// Runs writer threads against a limiter for about two seconds and checks that
+// the achieved throughput is within [0.80, 1.25] of the target rate, both for
+// the initial rate and after doubling it with SetBytesPerSecond().
+TEST_F(RateLimiterTest, Rate) {
+ auto* env = Env::Default();
+ // Shared state handed to every writer thread.
+ struct Arg {
+ Arg(int32_t _target_rate, int _burst)
+ : limiter(NewGenericRateLimiter(_target_rate, 100 * 1000, 10)),
+ request_size(_target_rate / 10),
+ burst(_burst) {}
+ std::unique_ptr<RateLimiter> limiter;
+ // Exclusive upper bound of the random request sizes drawn by the writers.
+ int32_t request_size;
+ // Parameter passed to Random::Skewed() to size the high-priority bursts.
+ int burst;
+ };
+
+ auto writer = [](void* p) {
+ auto* thread_env = Env::Default();
+ auto* arg = static_cast<Arg*>(p);
+ // Test for 2 seconds
+ auto until = thread_env->NowMicros() + 2 * 1000000;
+ Random r((uint32_t)(thread_env->NowNanos() %
+ std::numeric_limits<uint32_t>::max()));
+ while (thread_env->NowMicros() < until) {
+ // Note: the loop bound calls r.Skewed() again on every iteration, so the
+ // burst length is re-drawn each time the condition is evaluated.
+ for (int i = 0; i < static_cast<int>(r.Skewed(arg->burst) + 1); ++i) {
+ arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1,
+ Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ }
+ // One low-priority request after each high-priority burst.
+ arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW,
+ nullptr /* stats */, RateLimiter::OpType::kWrite);
+ }
+ };
+
+ // i doubles as the number of writer threads and the rate multiplier.
+ for (int i = 1; i <= 16; i *= 2) {
+ int32_t target = i * 1024 * 10;
+ Arg arg(target, i / 4 + 1);
+ int64_t old_total_bytes_through = 0;
+ for (int iter = 1; iter <= 2; ++iter) {
+ // second iteration changes the target dynamically
+ if (iter == 2) {
+ target *= 2;
+ arg.limiter->SetBytesPerSecond(target);
+ }
+ auto start = env->NowMicros();
+ for (int t = 0; t < i; ++t) {
+ env->StartThread(writer, &arg);
+ }
+ env->WaitForJoin();
+
+ auto elapsed = env->NowMicros() - start;
+ // Bytes granted during this iteration only, scaled to bytes per second.
+ double rate =
+ (arg.limiter->GetTotalBytesThrough() - old_total_bytes_through) *
+ 1000000.0 / elapsed;
+ old_total_bytes_through = arg.limiter->GetTotalBytesThrough();
+ fprintf(stderr,
+ "request size [1 - %" PRIi32 "], limit %" PRIi32
+ " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n",
+ arg.request_size - 1, target / 1024, rate / 1024,
+ elapsed / 1000000.0);
+
+ ASSERT_GE(rate / target, 0.80);
+ ASSERT_LE(rate / target, 1.25);
+ }
+ }
+}
+#endif
+
+// Checks that a request which is already in flight still completes when the
+// rate limit is changed (doubled or halved) before the refill happens. The
+// test has no value assertions: it passes if every writer thread returns.
+// Note: sync-point processing is enabled here and is not disabled again
+// inside this test.
+TEST_F(RateLimiterTest, LimitChangeTest) {
+ // starvation test when limit changes to a smaller value
+ int64_t refill_period = 1000 * 1000;
+ auto* env = Env::Default();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // Arguments handed to the writer thread.
+ struct Arg {
+ Arg(int32_t _request_size, Env::IOPriority _pri,
+ std::shared_ptr<RateLimiter> _limiter)
+ : request_size(_request_size), pri(_pri), limiter(_limiter) {}
+ int32_t request_size;
+ Env::IOPriority pri;
+ std::shared_ptr<RateLimiter> limiter;
+ };
+
+ // Issues a single write request of arg->request_size bytes.
+ auto writer = [](void* p) {
+ auto* arg = static_cast<Arg*>(p);
+ arg->limiter->Request(arg->request_size, arg->pri, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ };
+
+ for (uint32_t i = 1; i <= 16; i <<= 1) {
+ int32_t target = i * 1024 * 10;
+ // refill per second
+ for (int iter = 0; iter < 2; iter++) {
+ std::shared_ptr<RateLimiter> limiter =
+ std::make_shared<GenericRateLimiter>(
+ target, refill_period, 10, RateLimiter::Mode::kWritesOnly,
+ Env::Default(), false /* auto_tuned */);
+ // Ordering enforced: Request() is entered before the limit change starts,
+ // and the limit change ends before Refill() runs.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"GenericRateLimiter::Request",
+ "RateLimiterTest::LimitChangeTest:changeLimitStart"},
+ {"RateLimiterTest::LimitChangeTest:changeLimitEnd",
+ "GenericRateLimiter::Refill"}});
+ Arg arg(target, Env::IO_HIGH, limiter);
+ // The idea behind is to start a request first, then before it refills,
+ // update limit to a different value (2X/0.5X). No starvation should
+ // be guaranteed under any situation
+ // TODO(lightmark): more test cases are welcome.
+ env->StartThread(writer, &arg);
+ // iter == 0: target * 2; iter == 1: (target * 2) >> 2 == target / 2.
+ int32_t new_limit = (target << 1) >> (iter << 1);
+ TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitStart");
+ arg.limiter->SetBytesPerSecond(new_limit);
+ TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitEnd");
+ env->WaitForJoin();
+ fprintf(stderr,
+ "[COMPLETE] request size %" PRIi32 " KB, new limit %" PRIi32
+ "KB/sec, refill period %" PRIi64 " ms\n",
+ target / 1024, new_limit / 1024, refill_period / 1000);
+ }
+ }
+}
+
+// Exercises auto-tuning with a mocked clock: the burst size must grow after a
+// tuning window in which the limiter was drained in every period, and shrink
+// after a window with no drains.
+TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) {
+ const std::chrono::seconds kTimePerRefill(1);
+ const int kRefillsPerTune = 100; // needs to match util/rate_limiter.cc
+
+ // Time only advances through SleepForMicroseconds() calls on this env.
+ SpecialEnv special_env(Env::Default());
+ special_env.no_slowdown_ = true;
+ special_env.time_elapse_only_sleep_ = true;
+
+ auto stats = CreateDBStatistics();
+ std::unique_ptr<RateLimiter> rate_limiter(new GenericRateLimiter(
+ 1000 /* rate_bytes_per_sec */,
+ std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */,
+ RateLimiter::Mode::kWritesOnly, &special_env, true /* auto_tuned */));
+
+ // Use callback to advance time because we need to advance (1) after Request()
+ // has determined the bytes are not available; and (2) before Refill()
+ // computes the next refill time (ensuring refill time in the future allows
+ // the next request to drain the rate limiter).
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Refill", [&](void* /*arg*/) {
+ special_env.SleepForMicroseconds(static_cast<int>(
+ std::chrono::microseconds(kTimePerRefill).count()));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // verify rate limit increases after a sequence of periods where rate limiter
+ // is always drained
+ // Note: despite the variable names, the values compared in this test are
+ // GetSingleBurstBytes(), i.e. bytes per refill period (here one second).
+ int64_t orig_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+ rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+ RateLimiter::OpType::kWrite);
+ // Keep issuing full-burst requests until one tuning window of mock time has
+ // passed.
+ while (std::chrono::microseconds(special_env.NowMicros()) <=
+ kRefillsPerTune * kTimePerRefill) {
+ rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+ RateLimiter::OpType::kWrite);
+ }
+ int64_t new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+ ASSERT_GT(new_bytes_per_sec, orig_bytes_per_sec);
+
+ // From here on the Refill callback no longer advances the mock clock.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // decreases after a sequence of periods where rate limiter is not drained
+ orig_bytes_per_sec = new_bytes_per_sec;
+ special_env.SleepForMicroseconds(static_cast<int>(
+ kRefillsPerTune * std::chrono::microseconds(kTimePerRefill).count()));
+ // make a request so tuner can be triggered
+ rate_limiter->Request(1 /* bytes */, Env::IO_HIGH, stats.get(),
+ RateLimiter::OpType::kWrite);
+ new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+ ASSERT_LT(new_bytes_per_sec, orig_bytes_per_sec);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  // Hand the command line to GoogleTest first, then run every registered
+  // test; the value of RUN_ALL_TESTS() becomes the process exit status.
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_status = RUN_ALL_TESTS();
+  return exit_status;
+}
diff --git a/src/rocksdb/util/repeatable_thread.h b/src/rocksdb/util/repeatable_thread.h
new file mode 100644
index 000000000..1ac8edee6
--- /dev/null
+++ b/src/rocksdb/util/repeatable_thread.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/mock_time_env.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple wrapper around port::Thread that invokes a callback repeatedly,
+// waiting `delay_us` microseconds between invocations and `initial_delay_us`
+// microseconds before the first one. Time is read from the supplied Env.
+//
+// NOTE: thread() asserts delay_us > 0, so a zero delay is rejected in builds
+// with assertions enabled. Without assertions, a zero delay makes wait()
+// return immediately and the callback is invoked back to back.
+class RepeatableThread {
+ public:
+  // Starts the background thread immediately. `thread_name` is prefixed with
+  // "rocksdb:" before being applied to the thread.
+  RepeatableThread(std::function<void()> function,
+                   const std::string& thread_name, Env* env, uint64_t delay_us,
+                   uint64_t initial_delay_us = 0)
+      : function_(function),
+        thread_name_("rocksdb:" + thread_name),
+        env_(env),
+        delay_us_(delay_us),
+        initial_delay_us_(initial_delay_us),
+        mutex_(env),
+        cond_var_(&mutex_),
+        running_(true),
+#ifndef NDEBUG
+        waiting_(false),
+        run_count_(0),
+#endif
+        // Must stay the last initializer: the new thread uses the members
+        // initialized above.
+        thread_([this] { thread(); }) {
+  }
+
+  // Stops the background thread and joins it. Safe to call more than once;
+  // later calls return without doing anything.
+  void cancel() {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (!running_) {
+        return;
+      }
+      running_ = false;
+      cond_var_.SignalAll();
+    }
+    // Join outside the lock so the worker can acquire mutex_ while exiting.
+    thread_.join();
+  }
+
+  // Returns whether cancel() has not been called yet. Note that this reads
+  // running_ without taking mutex_.
+  bool IsRunning() { return running_; }
+
+  ~RepeatableThread() { cancel(); }
+
+#ifndef NDEBUG
+  // Wait until RepeatableThread starting waiting, call the optional callback,
+  // then wait for one run of RepeatableThread. Tests can use provide a
+  // custom env object to mock time, and use the callback here to bump current
+  // time and trigger RepeatableThread. See repeatable_thread_test for example.
+  //
+  // Note: only support one caller of this method. The callback is invoked
+  // with mutex_ held.
+  void TEST_WaitForRun(std::function<void()> callback = nullptr) {
+    InstrumentedMutexLock l(&mutex_);
+    while (!waiting_) {
+      cond_var_.Wait();
+    }
+    uint64_t prev_count = run_count_;
+    if (callback != nullptr) {
+      callback();
+    }
+    // Wake the worker so it re-checks the (possibly bumped) time.
+    cond_var_.SignalAll();
+    while (!(run_count_ > prev_count)) {
+      cond_var_.Wait();
+    }
+  }
+#endif
+
+ private:
+  // Blocks for `delay` microseconds (measured with env_->NowMicros()) or until
+  // cancel() is called, whichever comes first. Returns whether the thread
+  // should keep running.
+  bool wait(uint64_t delay) {
+    InstrumentedMutexLock l(&mutex_);
+    if (running_ && delay > 0) {
+      uint64_t wait_until = env_->NowMicros() + delay;
+#ifndef NDEBUG
+      waiting_ = true;
+      cond_var_.SignalAll();
+#endif
+      while (running_) {
+        cond_var_.TimedWait(wait_until);
+        // Re-check the clock after every wake-up: the wait may return before
+        // the deadline has been reached.
+        if (env_->NowMicros() >= wait_until) {
+          break;
+        }
+      }
+#ifndef NDEBUG
+      waiting_ = false;
+#endif
+    }
+    return running_;
+  }
+
+  // Body of the background thread.
+  void thread() {
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+    // Set thread name. Use pthread_self() rather than thread_.native_handle():
+    // this function runs on the new thread and may start before the
+    // constructor has finished initializing thread_, so reading thread_ here
+    // would race with that initialization.
+    int ret __attribute__((__unused__)) =
+        pthread_setname_np(pthread_self(), thread_name_.c_str());
+    assert(ret == 0);
+#endif
+#endif
+
+    assert(delay_us_ > 0);
+    if (!wait(initial_delay_us_)) {
+      return;
+    }
+    do {
+      function_();
+#ifndef NDEBUG
+      {
+        // Publish the completed run for TEST_WaitForRun().
+        InstrumentedMutexLock l(&mutex_);
+        run_count_++;
+        cond_var_.SignalAll();
+      }
+#endif
+    } while (wait(delay_us_));
+  }
+
+  const std::function<void()> function_;
+  const std::string thread_name_;
+  Env* const env_;
+  const uint64_t delay_us_;
+  const uint64_t initial_delay_us_;
+
+  // Mutex lock should be held when accessing running_, waiting_
+  // and run_count_.
+  InstrumentedMutex mutex_;
+  InstrumentedCondVar cond_var_;
+  bool running_;
+#ifndef NDEBUG
+  // RepeatableThread waiting for timeout.
+  bool waiting_;
+  // Times function_ had run.
+  uint64_t run_count_;
+#endif
+  port::Thread thread_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/repeatable_thread_test.cc b/src/rocksdb/util/repeatable_thread_test.cc
new file mode 100644
index 000000000..a18aa6cd1
--- /dev/null
+++ b/src/rocksdb/util/repeatable_thread_test.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/repeatable_thread.h"
+
+// Test fixture that owns a MockTimeEnv layered on top of the default Env.
+class RepeatableThreadTest : public testing::Test {
+ public:
+  RepeatableThreadTest() {
+    mock_env_.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(
+        ROCKSDB_NAMESPACE::Env::Default()));
+  }
+
+ protected:
+  // Environment whose current time the tests set explicitly.
+  std::unique_ptr<ROCKSDB_NAMESPACE::MockTimeEnv> mock_env_;
+};
+
+// Runs a RepeatableThread against the real clock with a one-second delay and
+// waits until the callback has fired kIteration times, checking inside the
+// callback that consecutive runs are at least one second apart.
+TEST_F(RepeatableThreadTest, TimedTest) {
+ constexpr uint64_t kSecond = 1000000; // 1s = 1000000us
+ constexpr int kIteration = 3;
+ ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
+ // `mutex` guards `count` and `prev_time`, which are shared between the
+ // callback and the test body.
+ ROCKSDB_NAMESPACE::port::Mutex mutex;
+ ROCKSDB_NAMESPACE::port::CondVar test_cv(&mutex);
+ int count = 0;
+ uint64_t prev_time = env->NowMicros();
+ ROCKSDB_NAMESPACE::RepeatableThread thread(
+ [&] {
+ ROCKSDB_NAMESPACE::MutexLock l(&mutex);
+ count++;
+ uint64_t now = env->NowMicros();
+ // The first run has no initial delay; later runs must be spaced by at
+ // least the configured delay. This is a plain assert, so it is only
+ // checked when assertions are enabled.
+ assert(count == 1 || prev_time + 1 * kSecond <= now);
+ prev_time = now;
+ if (count >= kIteration) {
+ test_cv.SignalAll();
+ }
+ },
+ "rt_test", env, 1 * kSecond);
+ // Wait for execution finish.
+ {
+ ROCKSDB_NAMESPACE::MutexLock l(&mutex);
+ while (count < kIteration) {
+ test_cv.Wait();
+ }
+ }
+
+ // Test cancel
+ thread.cancel();
+}
+
+// Drives a RepeatableThread with mocked time: each TEST_WaitForRun() call
+// bumps the mock clock by one second and waits for exactly one callback run.
+TEST_F(RepeatableThreadTest, MockEnvTest) {
+ constexpr uint64_t kSecond = 1000000; // 1s = 1000000us
+ constexpr int kIteration = 3;
+ mock_env_->set_current_time(0); // in seconds
+ std::atomic<int> count{0};
+
+#if defined(OS_MACOSX) && !defined(NDEBUG)
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+ // If the requested deadline is already in the (real) past, replace it
+ // with the current real time plus 1000, to ensure that
+ // RepeatableThread::wait invokes TimedWait with a time greater than
+ // (real) current time.
+ // NOTE(review): the value is in microseconds, so the code adds 1000us;
+ // this comment originally described the margin as "1000 extra
+ // seconds" -- confirm which one is intended.
+ // This is to prevent the TimedWait
+ // function from returning immediately without sleeping and releasing
+ // the mutex on certain platforms, e.g. OS X. If TimedWait returns
+ // immediately, the mutex will not be released, and
+ // RepeatableThread::TEST_WaitForRun never has a chance to execute the
+ // callback which, in this case, updates the result returned by
+ // mock_env->NowMicros. Consequently, RepeatableThread::wait cannot
+ // break out of the loop, causing test to hang. The extra margin
+ // is a best-effort approach because there seems no reliable and
+ // deterministic way to provide the aforementioned guarantee. By the
+ // time RepeatableThread::wait is called, it is no guarantee that the
+ // delay + mock_env->NowMicros will be greater than the current real
+ // time. However, the margin should be sufficient in most cases.
+ uint64_t time_us = *reinterpret_cast<uint64_t*>(arg);
+ if (time_us < mock_env_->RealNowMicros()) {
+ *reinterpret_cast<uint64_t*>(arg) = mock_env_->RealNowMicros() + 1000;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+#endif // OS_MACOSX && !NDEBUG
+
+ // One second between runs and one second before the first run.
+ ROCKSDB_NAMESPACE::RepeatableThread thread(
+ [&] { count++; }, "rt_test", mock_env_.get(), 1 * kSecond, 1 * kSecond);
+ for (int i = 1; i <= kIteration; i++) {
+ // Bump current time
+ thread.TEST_WaitForRun([&] { mock_env_->set_current_time(i); });
+ }
+ // Test function should be executed exactly kIteration times.
+ ASSERT_EQ(kIteration, count.load());
+
+ // Test cancel
+ thread.cancel();
+}
+
+int main(int argc, char** argv) {
+  // Hand the command line to GoogleTest first, then run every registered
+  // test; the value of RUN_ALL_TESTS() becomes the process exit status.
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_status = RUN_ALL_TESTS();
+  return exit_status;
+}
diff --git a/src/rocksdb/util/set_comparator.h b/src/rocksdb/util/set_comparator.h
new file mode 100644
index 000000000..9b5cfc1dc
--- /dev/null
+++ b/src/rocksdb/util/set_comparator.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace ROCKSDB_NAMESPACE {
+// Less-than functor for a std::set of Slices; defaults to bytewise ordering.
+struct SetComparator {
+  explicit SetComparator() : SetComparator(nullptr) {}
+  explicit SetComparator(const Comparator* user_comparator)
+      : user_comparator_(user_comparator != nullptr ? user_comparator
+                                                    : BytewiseComparator()) {}
+  // True when lhs orders strictly before rhs under the wrapped comparator.
+  bool operator()(const Slice& lhs, const Slice& rhs) const {
+    return 0 > user_comparator_->Compare(lhs, rhs);
+  }
+ private:
+  const Comparator* user_comparator_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/slice.cc b/src/rocksdb/util/slice.cc
new file mode 100644
index 000000000..6db11cc94
--- /dev/null
+++ b/src/rocksdb/util/slice.cc
@@ -0,0 +1,243 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/slice.h"
+#include "util/string_util.h"
+#include <stdio.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class FixedPrefixTransform : public SliceTransform {
+ public:
+  // NOTE: if any part of the name format changes, options_helper must change
+  // too so that RocksDBOptionsParser keeps working.
+  // TODO(yhchiang): move serialization / deserialization into this class.
+  explicit FixedPrefixTransform(size_t prefix_len)
+      : prefix_len_(prefix_len),
+        name_("rocksdb.FixedPrefix." + ToString(prefix_len)) {}
+
+  const char* Name() const override { return name_.c_str(); }
+
+  // Returns the first prefix_len_ bytes of src, which must be in the domain.
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    return Slice(src.data(), prefix_len_);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    return prefix_len_ <= src.size();
+  }
+
+  bool InRange(const Slice& dst) const override {
+    return prefix_len_ == dst.size();
+  }
+
+  bool FullLengthEnabled(size_t* len) const override {
+    *len = prefix_len_;
+    return true;
+  }
+
+  bool SameResultWhenAppended(const Slice& prefix) const override {
+    return InDomain(prefix);
+  }
+
+ private:
+  // Declared before name_ so it is initialized first.
+  size_t prefix_len_;
+  std::string name_;
+};
+
+class CappedPrefixTransform : public SliceTransform {
+ public:
+  // NOTE: if any part of the name format changes, options_helper must change
+  // too so that RocksDBOptionsParser keeps working.
+  // TODO(yhchiang): move serialization / deserialization into this class.
+  explicit CappedPrefixTransform(size_t cap_len)
+      : cap_len_(cap_len),
+        name_("rocksdb.CappedPrefix." + ToString(cap_len)) {}
+
+  const char* Name() const override { return name_.c_str(); }
+
+  // Returns the first min(cap_len_, src.size()) bytes of src.
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    const size_t n = src.size() < cap_len_ ? src.size() : cap_len_;
+    return Slice(src.data(), n);
+  }
+
+  bool InDomain(const Slice& /*src*/) const override { return true; }
+
+  bool InRange(const Slice& dst) const override {
+    return !(dst.size() > cap_len_);
+  }
+
+  bool FullLengthEnabled(size_t* len) const override {
+    *len = cap_len_;
+    return true;
+  }
+
+  bool SameResultWhenAppended(const Slice& prefix) const override {
+    return !(prefix.size() < cap_len_);
+  }
+
+ private:
+  size_t cap_len_;
+  std::string name_;
+};
+
+// Identity transform: Transform() hands back its argument unchanged.
+class NoopTransform : public SliceTransform {
+ public:
+  explicit NoopTransform() {}
+  const char* Name() const override { return "rocksdb.Noop"; }
+
+  // Every slice is accepted as input and as a possible output.
+  bool InDomain(const Slice& /*src*/) const override { return true; }
+  bool InRange(const Slice& /*dst*/) const override { return true; }
+
+  Slice Transform(const Slice& src) const override { return src; }
+
+  bool SameResultWhenAppended(const Slice& /*prefix*/) const override {
+    return false;
+  }
+};
+
+}
+
+// 2 small internal utility functions, for efficient hex conversions
+// and no need for snprintf, toupper etc...
+// Originally from wdt/util/EncryptionUtils.cpp - for ToString(true)/DecodeHex:
+char toHex(unsigned char v) {
+  // Maps 0-9 to '0'-'9' and 10-15 to 'A'-'F'.
+  const bool is_decimal_digit = v <= 9;
+  const int digit = is_decimal_digit ? '0' + v : 'A' + v - 10;
+  return static_cast<char>(digit);
+}
+// Decodes one hex digit, upper or lower case; -1 means "not a hex digit".
+int fromHex(char c) {
+  if (c >= '0' && c <= '9') {
+    return c - '0';
+  }
+  // Letters map to 10..15 regardless of case.
+  if (c >= 'a' && c <= 'f') {
+    return c - 'a' + 10;
+  }
+  if (c >= 'A' && c <= 'F') {
+    return c - 'A' + 10;
+  }
+  // Anything outside 0-9, a-f, A-F is rejected.
+  return -1;
+}
+
+// Appends every part to *buf, then points this Slice at all of *buf.
+Slice::Slice(const SliceParts& parts, std::string* buf) {
+  const int count = parts.num_parts;
+  size_t total = 0;
+  for (int k = 0; k < count; ++k) total += parts.parts[k].size();
+  buf->reserve(total);
+  for (int k = 0; k < count; ++k) {
+    const Slice& piece = parts.parts[k];
+    buf->append(piece.data(), piece.size());
+  }
+  data_ = buf->data();
+  size_ = buf->size();
+}
+
+// Returns a copy of the referenced bytes, or, when hex is true, their
+// uppercase hexadecimal spelling.
+std::string Slice::ToString(bool hex) const {
+  if (!hex) {
+    return std::string(data_, size_);
+  }
+  std::string encoded;
+  encoded.reserve(2 * size_);
+  // Two hex digits per byte, high nibble first.
+  for (const char* p = data_; p != data_ + size_; ++p) {
+    const unsigned char byte = static_cast<unsigned char>(*p);
+    encoded.push_back(toHex(byte >> 4));
+    encoded.push_back(toHex(byte & 0xf));
+  }
+  return encoded;
+}
+
+// Originally from rocksdb/utilities/ldb_cmd.h
+// Decodes this slice as hex text into *result. Returns false for a null
+// result, an odd length, or a non-hex character; after a bad character
+// *result keeps the bytes decoded up to that point.
+bool Slice::DecodeHex(std::string* result) const {
+  const size_t len = size_;
+  // Two hex digits make one byte, so an odd length cannot be complete.
+  if (result == nullptr || len % 2 != 0) {
+    return false;
+  }
+  result->clear();
+  result->reserve(len / 2);
+
+  for (size_t pos = 0; pos < len; pos += 2) {
+    const int hi = fromHex(data_[pos]);
+    if (hi < 0) {
+      return false;
+    }
+    const int lo = fromHex(data_[pos + 1]);
+    if (lo < 0) {
+      return false;
+    }
+    result->push_back(static_cast<char>((hi << 4) | lo));
+  }
+  return true;
+}
+// Returns a newly allocated FixedPrefixTransform built from prefix_len.
+const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) {
+ return new FixedPrefixTransform(prefix_len);
+}
+// Returns a newly allocated CappedPrefixTransform built from cap_len.
+const SliceTransform* NewCappedPrefixTransform(size_t cap_len) {
+ return new CappedPrefixTransform(cap_len);
+}
+// Returns a newly allocated NoopTransform.
+const SliceTransform* NewNoopTransform() {
+ return new NoopTransform;
+}
+// Move construction is implemented in terms of move assignment.
+PinnableSlice::PinnableSlice(PinnableSlice&& other) {
+ *this = std::move(other);
+}
+
+PinnableSlice& PinnableSlice::operator=(PinnableSlice&& other) {
+  if (this == &other) return *this;
+  // Reset this object's Cleanable first, then move other's Cleanable in.
+  Cleanable::Reset();
+  Cleanable::operator=(std::move(other));
+  size_ = other.size_;
+  pinned_ = other.pinned_;
+  if (pinned_) {
+    // Pinned: only the data pointer is taken; buf_ keeps its current value.
+    data_ = other.data_;
+  } else if (other.buf_ != &other.self_space_) {
+    // Unpinned, stored in a buffer other than other's self_space_: alias it.
+    buf_ = other.buf_;
+    data_ = other.data_;
+  } else {
+    self_space_ = std::move(other.self_space_);
+    buf_ = &self_space_;
+    data_ = buf_->data();
+  }
+  // Leave other empty, unpinned and backed by its own self_space_.
+  other.self_space_.clear();
+  other.buf_ = &other.self_space_;
+  other.pinned_ = false;
+  other.PinSelf();
+  return *this;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/slice_test.cc b/src/rocksdb/util/slice_test.cc
new file mode 100644
index 000000000..9e8a8e340
--- /dev/null
+++ b/src/rocksdb/util/slice_test.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Cleanup callback that records each performed cleanup as *arg1 *= *arg2.
+void Multiplier(void* arg1, void* arg2) {
+  const int factor = *reinterpret_cast<int*>(arg2);
+  int* const product = reinterpret_cast<int*>(arg1);
+  *product *= factor;
+}
+
+class PinnableSliceTest : public testing::Test {
+ public:
+  // Asserts that the bytes referenced by slice equal expected.
+  void AssertSameData(const std::string& expected,
+                      const PinnableSlice& slice) {
+    const std::string got(slice.data(), slice.size());
+    ASSERT_EQ(expected, got);
+  }
+};
+
+// Test that the external buffer is moved instead of being copied.
+TEST_F(PinnableSliceTest, MoveExternalBuffer) {
+ Slice s("123");
+ std::string buf;
+ PinnableSlice v1(&buf);
+ v1.PinSelf(s);
+ // Move construction: v2 must refer to buf itself rather than to a copy.
+ PinnableSlice v2(std::move(v1));
+ ASSERT_EQ(buf.data(), v2.data());
+ ASSERT_EQ(&buf, v2.GetSelf());
+ // Move assignment: the same must hold for v3.
+ PinnableSlice v3;
+ v3 = std::move(v2);
+ ASSERT_EQ(buf.data(), v3.data());
+ ASSERT_EQ(&buf, v3.GetSelf());
+}
+
+TEST_F(PinnableSliceTest, Move) {
+ int n2 = 2;
+ int res = 1;
+ const std::string const_str1 = "123";
+ const std::string const_str2 = "ABC";
+ Slice slice1(const_str1);
+ Slice slice2(const_str2);
+
+ {
+ // Test move constructor on a pinned slice.
+ res = 1;
+ PinnableSlice v1;
+ v1.PinSlice(slice1, Multiplier, &res, &n2);
+ PinnableSlice v2(std::move(v1));
+
+ // Since v1's Cleanable has been moved to v2,
+ // no cleanup should happen in Reset.
+ v1.Reset();
+ ASSERT_EQ(1, res);
+
+ AssertSameData(const_str1, v2);
+ }
+ // v2 is cleaned up.
+ ASSERT_EQ(2, res);
+
+ {
+ // Test move constructor on an unpinned slice.
+ PinnableSlice v1;
+ v1.PinSelf(slice1);
+ PinnableSlice v2(std::move(v1));
+
+ AssertSameData(const_str1, v2);
+ }
+
+ {
+ // Test move assignment from a pinned slice to
+ // another pinned slice.
+ res = 1;
+ PinnableSlice v1;
+ v1.PinSlice(slice1, Multiplier, &res, &n2);
+ PinnableSlice v2;
+ v2.PinSlice(slice2, Multiplier, &res, &n2);
+ v2 = std::move(v1);
+
+ // v2's Cleanable will be Reset before moving
+ // anything from v1.
+ ASSERT_EQ(2, res);
+ // Since v1's Cleanable has been moved to v2,
+ // no cleanup should happen in Reset.
+ v1.Reset();
+ ASSERT_EQ(2, res);
+
+ AssertSameData(const_str1, v2);
+ }
+ // The Cleanable moved from v1 to v2 will be Reset.
+ ASSERT_EQ(4, res);
+
+ {
+ // Test move assignment from a pinned slice to
+ // an unpinned slice.
+ res = 1;
+ PinnableSlice v1;
+ v1.PinSlice(slice1, Multiplier, &res, &n2);
+ PinnableSlice v2;
+ v2.PinSelf(slice2);
+ v2 = std::move(v1);
+
+ // Since v1's Cleanable has been moved to v2,
+ // no cleanup should happen in Reset.
+ v1.Reset();
+ ASSERT_EQ(1, res);
+
+ AssertSameData(const_str1, v2);
+ }
+ // The Cleanable moved from v1 to v2 will be Reset.
+ ASSERT_EQ(2, res);
+
+ {
+ // Test move assignment from an unpinned slice to
+ // another unpinned slice.
+ PinnableSlice v1;
+ v1.PinSelf(slice1);
+ PinnableSlice v2;
+ v2.PinSelf(slice2);
+ v2 = std::move(v1);
+
+ AssertSameData(const_str1, v2);
+ }
+
+ {
+ // Test move assignment from an unpinned slice to
+ // a pinned slice.
+ res = 1;
+ PinnableSlice v1;
+ v1.PinSelf(slice1);
+ PinnableSlice v2;
+ v2.PinSlice(slice2, Multiplier, &res, &n2);
+ v2 = std::move(v1);
+
+ // v2's Cleanable will be Reset before moving
+ // anything from v1.
+ ASSERT_EQ(2, res);
+
+ AssertSameData(const_str1, v2);
+ }
+ // No Cleanable is moved from v1 to v2, so no more cleanup.
+ ASSERT_EQ(2, res);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+// Installs the stack trace handler, then runs all tests via RUN_ALL_TESTS().
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/slice_transform_test.cc b/src/rocksdb/util/slice_transform_test.cc
new file mode 100644
index 000000000..6550cadea
--- /dev/null
+++ b/src/rocksdb/util/slice_transform_test.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/slice_transform.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SliceTransformTest : public testing::Test {};
+
+TEST_F(SliceTransformTest, CapPrefixTransform) {
+ std::string s;
+ s = "abcdefge";
+
+ std::unique_ptr<const SliceTransform> transform;
+ // Cap shorter than the key: only the first 6 bytes are kept.
+ transform.reset(NewCappedPrefixTransform(6));
+ ASSERT_EQ(transform->Transform(s).ToString(), "abcdef");
+ ASSERT_TRUE(transform->SameResultWhenAppended("123456"));
+ ASSERT_TRUE(transform->SameResultWhenAppended("1234567"));
+ ASSERT_TRUE(!transform->SameResultWhenAppended("12345"));
+ // Cap equal to the key length: the whole key comes back.
+ transform.reset(NewCappedPrefixTransform(8));
+ ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge");
+ // Cap longer than the key: the whole key comes back as well.
+ transform.reset(NewCappedPrefixTransform(10));
+ ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge");
+ // Zero cap: the prefix is empty for non-empty and empty keys alike.
+ transform.reset(NewCappedPrefixTransform(0));
+ ASSERT_EQ(transform->Transform(s).ToString(), "");
+
+ transform.reset(NewCappedPrefixTransform(0));
+ ASSERT_EQ(transform->Transform("").ToString(), "");
+}
+
+class SliceTransformDBTest : public testing::Test {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ public:
+ SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) {
+ dbname_ = test::PerThreadDBPath("slice_transform_db_test");
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ }
+ // Deletes the DB handle and calls DestroyDB() on dbname_.
+ ~SliceTransformDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ DB* db() { return db_; }
+
+ // Return the current option configuration.
+ Options* GetOptions() { return &last_options_; }
+
+ void DestroyAndReopen() {
+ // Destroy using last options
+ Destroy();
+ ASSERT_OK(TryReopen());
+ }
+ // Deletes the DB handle, then calls DestroyDB() on dbname_.
+ void Destroy() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, last_options_));
+ }
+ // Deletes any open handle and opens dbname_ with create_if_missing set.
+ Status TryReopen() {
+ delete db_;
+ db_ = nullptr;
+ last_options_.create_if_missing = true;
+
+ return DB::Open(last_options_, dbname_, &db_);
+ }
+
+ Options last_options_;
+};
+// Helper: reads one ticker from the statistics object attached to options.
+namespace {
+uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+}
+} // namespace
+
+TEST_F(SliceTransformDBTest, CapPrefix) {
+ last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ last_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ ASSERT_OK(TryReopen());
+
+ ReadOptions ro;
+ FlushOptions fo;
+ WriteOptions wo;
+ // Write four keys, then flush them.
+ ASSERT_OK(db()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(db()->Put(wo, "foo", "bar"));
+ ASSERT_OK(db()->Put(wo, "foo3", "bar3"));
+ ASSERT_OK(db()->Flush(fo));
+
+ std::unique_ptr<Iterator> iter(db()->NewIterator(ro));
+ // Each Seek checks the iterator and the BLOOM_FILTER_PREFIX_USEFUL ticker.
+ iter->Seek("foo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value().ToString(), "bar");
+ ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U);
+
+ iter->Seek("foo2");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U);
+
+ iter->Seek("barbarbar");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value().ToString(), "foo");
+ ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U);
+
+ iter->Seek("barfoofoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U);
+
+ iter->Seek("foobarbar");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+// Initializes googletest and returns the RUN_ALL_TESTS() result.
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/status.cc b/src/rocksdb/util/status.cc
new file mode 100644
index 000000000..3b1ffde56
--- /dev/null
+++ b/src/rocksdb/util/status.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/status.h"
+#include <stdio.h>
+#ifdef OS_WIN
+#include <string.h>
+#endif
+#include <cstring>
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a new[]-allocated copy of state, including its terminating NUL.
+const char* Status::CopyState(const char* state) {
+#ifdef OS_WIN
+ const size_t cch = std::strlen(state) + 1; // +1 for the null terminator
+ char* result = new char[cch];
+ errno_t ret
+#if defined(_MSC_VER)
+ ;
+#else
+ __attribute__((__unused__));
+#endif
+ ret = strncpy_s(result, cch, state, cch - 1);
+ result[cch - 1] = '\0';
+ assert(ret == 0);
+ return result;
+#else
+ const size_t cch = std::strlen(state) + 1; // +1 for the null terminator
+ return std::strncpy(new char[cch], state, cch);
+#endif
+}
+// Subcode messages; Status::ToString() indexes this by the subcode's value.
+static const char* msgs[static_cast<int>(Status::kMaxSubCode)] = {
+ "", // kNone
+ "Timeout Acquiring Mutex", // kMutexTimeout
+ "Timeout waiting to lock key", // kLockTimeout
+ "Failed to acquire lock due to max_num_locks limit", // kLockLimit
+ "No space left on device", // kNoSpace
+ "Deadlock", // kDeadlock
+ "Stale file handle", // kStaleFile
+ "Memory limit reached", // kMemoryLimit
+ "Space limit reached", // kSpaceLimit
+ "No such file or directory", // kPathNotFound
+ // kMergeOperandsInsufficientCapacity
+ "Insufficient capacity for merge operands",
+ // kManualCompactionPaused
+ "Manual compaction paused",
+};
+
+// Builds a non-OK status whose state_ is msg, or "msg: msg2" if msg2 is set.
+Status::Status(Code _code, SubCode _subcode, const Slice& msg,
+               const Slice& msg2)
+    : code_(_code), subcode_(_subcode), sev_(kNoError) {
+  assert(code_ != kOk);
+  assert(subcode_ != kMaxSubCode);
+  const size_t len1 = msg.size(), len2 = msg2.size();
+  const size_t total = len1 + (len2 != 0 ? len2 + 2 : 0);
+  char* const text = new char[total + 1];  // +1 for null terminator
+  memcpy(text, msg.data(), len1);
+  // A non-empty msg2 is appended after a ": " separator.
+  if (len2 != 0) {
+    memcpy(text + len1, ": ", 2);
+    memcpy(text + len1 + 2, msg2.data(), len2);
+  }
+  text[total] = '\0';
+  state_ = text;
+}
+
+// Returns "OK" for kOk; otherwise the label of code_ followed by the subcode
+// message (unless subcode_ is kNone) and then state_ (unless null).
+std::string Status::ToString() const {
+  char tmp[30];
+  const char* type;
+  switch (code_) {
+    case kOk: return "OK";
+    case kNotFound:
+      type = "NotFound: ";
+      break;
+    case kCorruption:
+      type = "Corruption: ";
+      break;
+    case kNotSupported:
+      type = "Not implemented: ";
+      break;
+    case kInvalidArgument:
+      type = "Invalid argument: ";
+      break;
+    case kIOError:
+      type = "IO error: ";
+      break;
+    case kMergeInProgress:
+      type = "Merge in progress: ";
+      break;
+    case kIncomplete:
+      type = "Result incomplete: ";
+      break;
+    case kShutdownInProgress:
+      type = "Shutdown in progress: ";
+      break;
+    case kTimedOut:
+      type = "Operation timed out: ";
+      break;
+    case kAborted:
+      type = "Operation aborted: ";
+      break;
+    case kBusy:
+      type = "Resource busy: ";
+      break;
+    case kExpired:
+      type = "Operation expired: ";
+      break;
+    case kTryAgain:
+      type = "Operation failed. Try again.: ";
+      break;
+    case kColumnFamilyDropped:
+      type = "Column family dropped: ";
+      break;
+    default:
+      snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
+               static_cast<int>(code()));
+      type = tmp;
+  }
+  std::string result(type);
+  if (subcode_ != kNone) {
+    // Compare against the element count; sizeof(msgs) alone counts bytes.
+    uint32_t index = static_cast<int32_t>(subcode_);
+    assert(sizeof(msgs) / sizeof(msgs[0]) > index);
+    result.append(msgs[index]);
+  }
+  if (state_ != nullptr) {
+    result.append(state_);
+  }
+  return result;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/stderr_logger.h b/src/rocksdb/util/stderr_logger.h
new file mode 100644
index 000000000..abf8f5701
--- /dev/null
+++ b/src/rocksdb/util/stderr_logger.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Prints logs to stderr for faster debugging
+class StderrLogger : public Logger {
+ public:
+ explicit StderrLogger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+ : Logger(log_level) {}
+
+ // Brings overloaded Logv()s into scope so they're not hidden when we override
+ // a subset of them.
+ using Logger::Logv;
+ // Writes the formatted message to stderr and ends it with a newline.
+ virtual void Logv(const char* format, va_list ap) override {
+ vfprintf(stderr, format, ap);
+ fprintf(stderr, "\n");
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/stop_watch.h b/src/rocksdb/util/stop_watch.h
new file mode 100644
index 000000000..ad4905960
--- /dev/null
+++ b/src/rocksdb/util/stop_watch.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "monitoring/statistics.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Auto-scoped.
+// Records the measure time into the corresponding histogram if statistics
+// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr
+// and overwrite is true, it will be added to *elapsed if overwrite is false.
+class StopWatch {
+ public:
+ StopWatch(Env* const env, Statistics* statistics, const uint32_t hist_type,
+ uint64_t* elapsed = nullptr, bool overwrite = true,
+ bool delay_enabled = false)
+ : env_(env),
+ statistics_(statistics),
+ hist_type_(hist_type),
+ elapsed_(elapsed),
+ overwrite_(overwrite),
+ stats_enabled_(statistics &&
+ statistics->get_stats_level() >=
+ StatsLevel::kExceptTimers &&
+ statistics->HistEnabledForType(hist_type)),
+ delay_enabled_(delay_enabled),
+ total_delay_(0),
+ delay_start_time_(0),
+ start_time_((stats_enabled_ || elapsed != nullptr) ? env->NowMicros()
+ : 0) {}
+ // Updates *elapsed_ (if set) and reports the time to the histogram (if on).
+ ~StopWatch() {
+ if (elapsed_) {
+ if (overwrite_) {
+ *elapsed_ = env_->NowMicros() - start_time_;
+ } else {
+ *elapsed_ += env_->NowMicros() - start_time_;
+ }
+ }
+ if (elapsed_ && delay_enabled_) {
+ *elapsed_ -= total_delay_;
+ }
+ if (stats_enabled_) {
+ statistics_->reportTimeToHistogram(
+ hist_type_, (elapsed_ != nullptr)
+ ? *elapsed_
+ : (env_->NowMicros() - start_time_));
+ }
+ }
+ // Begins a delay interval; the destructor subtracts delays from *elapsed_.
+ void DelayStart() {
+ // if delay_start_time_ is not 0, it means we are already tracking delay,
+ // so delay_start_time_ should not be overwritten
+ if (elapsed_ && delay_enabled_ && delay_start_time_ == 0) {
+ delay_start_time_ = env_->NowMicros();
+ }
+ }
+ // Ends the delay interval in progress and adds its length to total_delay_.
+ void DelayStop() {
+ if (elapsed_ && delay_enabled_ && delay_start_time_ != 0) {
+ total_delay_ += env_->NowMicros() - delay_start_time_;
+ }
+ // reset to 0 means currently no delay is being tracked, so two consecutive
+ // calls to DelayStop will not increase total_delay_
+ delay_start_time_ = 0;
+ }
+ // Total delay recorded so far, or 0 when delay tracking is disabled.
+ uint64_t GetDelay() const { return delay_enabled_ ? total_delay_ : 0; }
+
+ uint64_t start_time() const { return start_time_; }
+
+ private:
+ Env* const env_;
+ Statistics* statistics_;
+ const uint32_t hist_type_;
+ uint64_t* elapsed_;
+ bool overwrite_;
+ bool stats_enabled_;
+ bool delay_enabled_;
+ uint64_t total_delay_;
+ uint64_t delay_start_time_;
+ const uint64_t start_time_;
+};
+
+// Stopwatch with nanosecond resolution, driven by Env::NowNanos().
+class StopWatchNano {
+ public:
+  // Starts timing immediately when auto_start is true.
+  explicit StopWatchNano(Env* const env, bool auto_start = false)
+      : env_(env), start_(0) {
+    if (auto_start) Start();
+  }
+
+  void Start() { start_ = env_->NowNanos(); }
+
+  // Nanoseconds since the last start; reset restarts timing from now.
+  uint64_t ElapsedNanos(bool reset = false) {
+    const auto now = env_->NowNanos();
+    const auto elapsed = now - start_;
+    start_ = reset ? now : start_;
+    return elapsed;
+  }
+
+  // Like ElapsedNanos(), but yields 0 when env_ is null.
+  uint64_t ElapsedNanosSafe(bool reset = false) {
+    if (env_ == nullptr) return 0U;
+    return ElapsedNanos(reset);
+  }
+
+ private:
+  Env* const env_;
+  uint64_t start_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/string_util.cc b/src/rocksdb/util/string_util.cc
new file mode 100644
index 000000000..bb234f239
--- /dev/null
+++ b/src/rocksdb/util/string_util.cc
@@ -0,0 +1,409 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "util/string_util.h"
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <cinttypes>
+#include <cmath>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "port/port.h"
+#include "port/sys_time.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kNullptrString = "nullptr";
+
+// Splits arg on delim with std::getline, so a trailing delim adds no item.
+std::vector<std::string> StringSplit(const std::string& arg, char delim) {
+  std::vector<std::string> pieces;
+  std::istringstream input(arg);
+  for (std::string piece; std::getline(input, piece, delim);) {
+    pieces.push_back(piece);
+  }
+  return pieces;
+}
+
+// Writes a human-readable rendering of a microsecond count into output (at
+// most len bytes) and returns snprintf's result. Without fixed_format:
+// < 10 ms -> "XX us", < 10 s -> "XX ms", < 1 min -> "XX sec", < 1 hour ->
+// "M:S". One hour or more, or fixed_format set, -> "H:M:S".
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format) {
+  const double as_double = static_cast<double>(micros);
+  const double sec_of_min = static_cast<double>(micros % 60000000) / 1000000;
+  const uint64_t whole_min = micros / 1000000 / 60;
+  if (fixed_format || micros >= 3600000000ULL) {
+    return snprintf(output, len, "%02" PRIu64 ":%02" PRIu64 ":%05.3f H:M:S",
+                    micros / 1000000 / 3600, whole_min % 60, sec_of_min);
+  }
+  if (micros >= 60000000) {
+    return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S", whole_min,
+                    sec_of_min);
+  }
+  if (micros >= 10000000) {
+    return snprintf(output, len, "%.3lf sec", as_double / 1000000);
+  }
+  if (micros >= 10000) {
+    return snprintf(output, len, "%.3lf ms", as_double / 1000);
+  }
+  return snprintf(output, len, "%" PRIu64 " us", micros);
+}
+
+// Writes bytes into output (at most len bytes) as whole TB, GB, MB or KB,
+// using the largest unit that gives at least 10, else as plain "B".
+int AppendHumanBytes(uint64_t bytes, char* output, int len) {
+  const uint64_t kTen = 10;
+  if (bytes < (kTen << 10)) {
+    return snprintf(output, len, "%" PRIu64 "B", bytes);
+  }
+  if (bytes < (kTen << 20)) {
+    return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10);
+  }
+  if (bytes < (kTen << 30)) {
+    return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20);
+  }
+  if (bytes < (kTen << 40)) {
+    return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30);
+  }
+  return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40);
+}
+
+void AppendNumberTo(std::string* str, uint64_t num) {
+  char digits[30];  // room for the 20 digits of the largest uint64_t + NUL
+  snprintf(digits, sizeof(digits), "%" PRIu64, num);
+  str->append(digits);
+}
+
+// Appends value to *str, writing bytes outside ' '..'~' as \xNN (lowercase).
+void AppendEscapedStringTo(std::string* str, const Slice& value) {
+  for (size_t pos = 0; pos != value.size(); ++pos) {
+    const char ch = value[pos];
+    if (ch >= ' ' && ch <= '~') {
+      str->push_back(ch);
+      continue;
+    }
+    char hex[10];
+    snprintf(hex, sizeof(hex), "\\x%02x", static_cast<unsigned>(ch) & 0xff);
+    str->append(hex);
+  }
+}
+
+std::string NumberToString(uint64_t num) {
+  std::string text;  // receives the digits written by AppendNumberTo()
+  AppendNumberTo(&text, num);
+  return text;
+}
+
+std::string NumberToHumanString(int64_t num) {
+  // Range checks replace abs(num): negating INT64_MIN is undefined behavior.
+  char buf[19];
+  if (num > -10000 && num < 10000) {
+    snprintf(buf, sizeof(buf), "%" PRIi64, num);
+  } else if (num > -10000000 && num < 10000000) {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000);
+  } else if (num > -10000000000LL && num < 10000000000LL) {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000);
+  } else {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000);
+  }
+  return std::string(buf);
+}
+
+// Renders bytes with two decimals in KB, MB, GB or TB. The value starts in
+// KB and moves up one unit for every factor of 1024, stopping at TB; inputs
+// below 1 KB are still shown in KB.
+std::string BytesToHumanString(uint64_t bytes) {
+  static const char* const kUnits[] = {"KB", "MB", "GB", "TB"};
+  const size_t kLastUnit = 3;
+
+  // always start with KB
+  double scaled = static_cast<double>(bytes) / 1024;
+  size_t unit = 0;
+  for (; unit < kLastUnit && scaled >= 1024; ++unit) {
+    scaled /= 1024;
+  }
+
+  char text[20];
+  snprintf(text, sizeof(text), "%.2f %s", scaled, kUnits[unit]);
+  return std::string(text);
+}
+
+std::string TimeToHumanString(int unixtime) {
+  const time_t seconds = unixtime;
+  struct tm broken_down;
+  struct tm* const filled = localtime_r(&seconds, &broken_down);
+  assert(filled == &broken_down);
+  char text[80];  // receives the strftime "%c" rendering of the local time
+  strftime(text, sizeof(text), "%c", filled);
+  return std::string(text);
+}
+
+std::string EscapeString(const Slice& value) {
+  std::string escaped;  // filled in by AppendEscapedStringTo()
+  AppendEscapedStringTo(&escaped, value);
+  return escaped;
+}
+
+// Consumes the leading decimal digits of *in and stores their value in *val.
+// Returns true if at least one digit was consumed. On uint64_t overflow it
+// returns false without writing *val; the digits accepted before the
+// overflowing one have already been removed from *in.
+bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
+  static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
+  uint64_t parsed = 0;
+  int digits = 0;
+  for (; !in->empty(); in->remove_prefix(1), ++digits) {
+    const char c = (*in)[0];
+    if (c < '0' || c > '9') {
+      break;
+    }
+    const unsigned int delta = (c - '0');
+    if (parsed > kMaxUint64 / 10 ||
+        (parsed == kMaxUint64 / 10 && delta > kMaxUint64 % 10)) {
+      return false;  // this digit would overflow uint64_t
+    }
+    parsed = (parsed * 10) + delta;
+  }
+  *val = parsed;
+  return (digits > 0);
+}
+
+// True for backslash, '#', ':', carriage return and line feed -- the
+// characters that EscapeOptionString() prefixes with a backslash.
+bool isSpecialChar(const char c) {
+  const bool line_break = c == '\r' || c == '\n';
+  return line_break || c == '\\' || c == '#' || c == ':';
+}
+
+namespace {
+using CharMap = std::pair<char, char>;
+}
+
+// 'r' -> carriage return, 'n' -> line feed; other characters map to themselves.
+char UnescapeChar(const char c) {
+  switch (c) {
+    case 'r':
+      return '\r';
+    case 'n':
+      return '\n';
+    default:
+      return c;
+  }
+}
+
+// Line feed -> 'n', carriage return -> 'r'; other characters map to themselves.
+char EscapeChar(const char c) {
+  switch (c) {
+    case '\n':
+      return 'n';
+    case '\r':
+      return 'r';
+    default:
+      return c;
+  }
+}
+
+// Puts a backslash before each special character, mapped via EscapeChar().
+std::string EscapeOptionString(const std::string& raw_string) {
+  std::string escaped;
+  for (const char c : raw_string) {
+    if (!isSpecialChar(c)) {
+      escaped += c;
+      continue;
+    }
+    escaped += '\\';
+    escaped += EscapeChar(c);
+  }
+  return escaped;
+}
+
+// Removes each escaping backslash and passes the character that follows it
+// through UnescapeChar(). A backslash that ends the string is dropped
+// without producing output.
+std::string UnescapeOptionString(const std::string& escaped_string) {
+  std::string plain;
+  bool after_backslash = false;
+  for (const char c : escaped_string) {
+    if (after_backslash) {
+      plain += UnescapeChar(c);
+      after_backslash = false;
+    } else if (c == '\\') {
+      after_backslash = true;
+    } else {
+      plain += c;
+    }
+  }
+  return plain;
+}
+
+// Returns str without leading and trailing whitespace (as classified by
+// isspace); an empty or all-whitespace input yields an empty string.
+std::string trim(const std::string& str) {
+  size_t start = 0;
+  size_t end = str.size();  // one past the last kept character
+  // isspace() needs a value representable as unsigned char, so convert
+  // first; a plain char may be negative for bytes >= 0x80.
+  while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
+    ++start;
+  }
+  while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
+    --end;
+  }
+  return str.substr(start, end - start);
+}
+
+#ifndef ROCKSDB_LITE
+
+// "true"/"1" -> true, "false"/"0" -> false, else std::invalid_argument(type).
+bool ParseBoolean(const std::string& type, const std::string& value) {
+  const bool is_true = value == "true" || value == "1";
+  if (!is_true && value != "false" && value != "0") {
+    throw std::invalid_argument(type);
+  }
+  return is_true;
+}
+
+// ParseUint64() narrowed to 32 bits; throws std::out_of_range if it won't fit.
+uint32_t ParseUint32(const std::string& value) {
+  const uint64_t wide = ParseUint64(value);
+  if ((wide >> 32LL) != 0) {
+    throw std::out_of_range(value);
+  }
+  return static_cast<uint32_t>(wide);
+}
+
+// ParseInt64() narrowed to 32 bits; throws std::out_of_range if it won't fit.
+int32_t ParseInt32(const std::string& value) {
+  const int64_t wide = ParseInt64(value);
+  if (wide > port::kMaxInt32 || wide < port::kMinInt32) {
+    throw std::out_of_range(value);
+  }
+  return static_cast<int32_t>(wide);
+}
+
+#endif
+
+// Parses an unsigned number; a k/m/g/t suffix shifts it by 10/20/30/40 bits.
+uint64_t ParseUint64(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  uint64_t num = std::stoull(value.c_str(), &endchar);
+#else
+  // strtoull rather than strtoul: unsigned long can be narrower than 64 bits.
+  char* endptr;
+  uint64_t num = std::strtoull(value.c_str(), &endptr, 0);
+  endchar = endptr - value.c_str();
+#endif
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    if (c == 'k' || c == 'K')
+      num <<= 10LL;
+    else if (c == 'm' || c == 'M')
+      num <<= 20LL;
+    else if (c == 'g' || c == 'G')
+      num <<= 30LL;
+    else if (c == 't' || c == 'T')
+      num <<= 40LL;
+  }
+  return num;
+}
+
+// Parses a signed number; a k/m/g/t suffix scales it by 2^10/2^20/2^30/2^40.
+int64_t ParseInt64(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  int64_t num = std::stoll(value.c_str(), &endchar);
+#else
+  char* endptr;
+  int64_t num = std::strtoll(value.c_str(), &endptr, 0);
+  endchar = endptr - value.c_str();
+#endif
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    // Multiply: left-shifting a negative value is undefined before C++20.
+    if (c == 'k' || c == 'K')
+      num *= 1LL << 10;
+    else if (c == 'm' || c == 'M')
+      num *= 1LL << 20;
+    else if (c == 'g' || c == 'G')
+      num *= 1LL << 30;
+    else if (c == 't' || c == 'T')
+      num *= 1LL << 40;
+  }
+  return num;
+}
+
+// Parses an int; a k/m/g suffix multiplies it by 2^10/2^20/2^30 (no shifts).
+int ParseInt(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  int num = std::stoi(value.c_str(), &endchar);
+#else
+  // strtol, not strtoul: the input may be negative.
+  char* endptr;
+  int num = static_cast<int>(std::strtol(value.c_str(), &endptr, 0));
+  endchar = endptr - value.c_str();
+#endif
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    if (c == 'k' || c == 'K')
+      num *= 1 << 10;
+    else if (c == 'm' || c == 'M')
+      num *= 1 << 20;
+    else if (c == 'g' || c == 'G')
+      num *= 1 << 30;
+  }
+  return num;
+}
+// Converts value with std::stod, or with std::strtod when CYGWIN is defined.
+double ParseDouble(const std::string& value) {
+#ifndef CYGWIN
+ return std::stod(value);
+#else
+ return std::strtod(value.c_str(), 0);
+#endif
+}
+// Parses with ParseUint64() and casts the result to size_t.
+size_t ParseSizeT(const std::string& value) {
+ return static_cast<size_t>(ParseUint64(value));
+}
+
+// Parses a ':'-separated list, converting each element with ParseInt().
+// An empty input produces an empty vector.
+std::vector<int> ParseVectorInt(const std::string& value) {
+  std::vector<int> numbers;
+  size_t begin = 0;
+  while (begin < value.size()) {
+    const size_t colon = value.find(':', begin);
+    const bool last = colon == std::string::npos;
+    const size_t count = last ? std::string::npos : colon - begin;
+    numbers.push_back(ParseInt(value.substr(begin, count)));
+    if (last) break;
+    begin = colon + 1;
+  }
+  return numbers;
+}
+
+// Overwrites *value with the elements of vec joined by ':' (each rendered
+// through ToString()). An empty vec gives an empty string. Always returns
+// true.
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value) {
+  value->clear();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    *value += (i == 0 ? "" : ":") + ToString(vec[i]);
+  }
+  return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/string_util.h b/src/rocksdb/util/string_util.h
new file mode 100644
index 000000000..a761be66c
--- /dev/null
+++ b/src/rocksdb/util/string_util.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+extern std::vector<std::string> StringSplit(const std::string& arg, char delim);
+
+// Returns the textual form of "value".
+template <typename T>
+inline std::string ToString(T value) {
+#if defined(OS_ANDROID) || defined(CYGWIN) || defined(OS_FREEBSD)
+  // Android or cygwin doesn't support all of C++11, std::to_string() being
+  // one of the not supported features, so format through a stream instead.
+  std::ostringstream formatted;
+  formatted << value;
+  return formatted.str();
+#else
+  return std::to_string(value);
+#endif
+}
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Return a string printout of "num"
+extern std::string NumberToString(uint64_t num);
+
+// Return a human-readable version of num.
+// for num >= 10.000, prints "xxK"
+// for num >= 10.000.000, prints "xxM"
+// for num >= 10.000.000.000, prints "xxG"
+extern std::string NumberToHumanString(int64_t num);
+
+// Return a human-readable version of bytes
+// ex: 1048576 -> 1.00 MB
+extern std::string BytesToHumanString(uint64_t bytes);
+
+// Return a human-readable version of unix time
+// ex: 1562116015 -> "Tue Jul 2 18:06:55 2019"
+extern std::string TimeToHumanString(int unixtime);
+
+// Append a human-readable time in micros.
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+ bool fixed_format);
+
+// Append a human-readable size in bytes
+int AppendHumanBytes(uint64_t bytes, char* output, int len);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// Parse a human-readable number from "*in" into "*val". On success,
+// advances "*in" past the consumed number and sets "*val" to the
+// numeric value. Otherwise, returns false and leaves "*in" in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+// Returns true if the input char "c" is considered as a special character
+// that will be escaped when EscapeOptionString() is called.
+//
+// @param c the input char
+// @return true if the input char "c" is considered as a special character.
+// @see EscapeOptionString
+bool isSpecialChar(const char c);
+
+// If the input char is an escaped char, it will return its
+// associated raw-char. Otherwise, the function will simply return
+// the original input char.
+char UnescapeChar(const char c);
+
+// If the input char is a control char, it will return its
+// associated escaped char. Otherwise, the function will simply return
+// the original input char.
+char EscapeChar(const char c);
+
+// Converts a raw string to an escaped string. Escaped-characters are
+// defined via the isSpecialChar() function. When a char in the input
+// string "raw_string" is classified as a special character, then it
+// will be prefixed by '\' in the output.
+//
+// Its inverse function is UnescapeOptionString().
+// @param raw_string the input string
+// @return the '\' escaped string of the input "raw_string"
+// @see isSpecialChar, UnescapeOptionString
+std::string EscapeOptionString(const std::string& raw_string);
+
+// The inverse function of EscapeOptionString. It converts
+// an '\' escaped string back to a raw string.
+//
+// @param escaped_string the input '\' escaped string
+// @return the raw string of the input "escaped_string"
+std::string UnescapeOptionString(const std::string& escaped_string);
+
+std::string trim(const std::string& str);
+
+#ifndef ROCKSDB_LITE
+bool ParseBoolean(const std::string& type, const std::string& value);
+
+uint32_t ParseUint32(const std::string& value);
+
+int32_t ParseInt32(const std::string& value);
+#endif
+
+uint64_t ParseUint64(const std::string& value);
+
+// Parses the leading integer of "value". A 'k'/'K', 'm'/'M' or 'g'/'G'
+// directly after the digits scales the result by 2^10, 2^20 or 2^30.
+int ParseInt(const std::string& value);
+
+// 64-bit counterpart of ParseInt(); additionally accepts a 't'/'T' suffix
+// (2^40).
+int64_t ParseInt64(const std::string& value);
+
+// Parses "value" as a floating-point number.
+double ParseDouble(const std::string& value);
+
+// Returns ParseUint64(value) converted to size_t.
+size_t ParseSizeT(const std::string& value);
+
+// Parses a ':'-separated list of integers (each element goes through
+// ParseInt()), e.g. "1:2k:3".
+std::vector<int> ParseVectorInt(const std::string& value);
+
+// Writes the elements of "vec" to *value joined by ':'; the inverse of
+// ParseVectorInt(). Always returns true.
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value);
+
+extern const std::string kNullptrString;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/thread_list_test.cc b/src/rocksdb/util/thread_list_test.cc
new file mode 100644
index 000000000..44c3ebc99
--- /dev/null
+++ b/src/rocksdb/util/thread_list_test.cc
@@ -0,0 +1,352 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <mutex>
+#include <condition_variable>
+
+#include "monitoring/thread_status_updater.h"
+#include "rocksdb/db.h"
+#include "test_util/testharness.h"
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test helper that simulates a long-running background job. Every thread that
+// executes Run() publishes the configured column-family key, operation type
+// and state type through the ThreadStatusUpdater of Env::Default(), then
+// blocks until FinishAllTasks() is called.
+class SimulatedBackgroundTask {
+ public:
+ // Registers (db_key, db_name, cf_key, cf_name) with the thread status
+ // updater. Operation and state type default to the *_UNKNOWN values.
+ SimulatedBackgroundTask(
+ const void* db_key, const std::string& db_name,
+ const void* cf_key, const std::string& cf_name,
+ const ThreadStatus::OperationType operation_type =
+ ThreadStatus::OP_UNKNOWN,
+ const ThreadStatus::StateType state_type =
+ ThreadStatus::STATE_UNKNOWN)
+ : db_key_(db_key), db_name_(db_name),
+ cf_key_(cf_key), cf_name_(cf_name),
+ operation_type_(operation_type), state_type_(state_type),
+ should_run_(true), running_count_(0) {
+ Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo(
+ db_key_, db_name_, cf_key_, cf_name_);
+ }
+
+ // Erases the database info that the constructor registered for db_key_.
+ ~SimulatedBackgroundTask() {
+ Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_);
+ }
+
+ // Body of one simulated job: bumps running_count_, sets the calling
+ // thread's status, waits on bg_cv_ until should_run_ becomes false, then
+ // clears the status again and wakes any WaitUntilDone() caller.
+ void Run() {
+ std::unique_lock<std::mutex> l(mutex_);
+ running_count_++;
+ Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_);
+ Env::Default()->GetThreadStatusUpdater()->SetThreadOperation(
+ operation_type_);
+ Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_);
+ // The loop guards against spurious wake-ups of the condition variable.
+ while (should_run_) {
+ bg_cv_.wait(l);
+ }
+ Env::Default()->GetThreadStatusUpdater()->ClearThreadState();
+ Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation();
+ Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(nullptr);
+ running_count_--;
+ bg_cv_.notify_all();
+ }
+
+ // Tells every thread blocked in Run() to finish.
+ void FinishAllTasks() {
+ std::unique_lock<std::mutex> l(mutex_);
+ should_run_ = false;
+ bg_cv_.notify_all();
+ }
+
+ // Polls (sleeping 1000 microseconds per iteration) until at least job_count
+ // threads have entered Run().
+ void WaitUntilScheduled(int job_count, Env* env) {
+ while (running_count_ < job_count) {
+ env->SleepForMicroseconds(1000);
+ }
+ }
+
+ // Blocks until every thread that entered Run() has left it again.
+ void WaitUntilDone() {
+ std::unique_lock<std::mutex> l(mutex_);
+ while (running_count_ > 0) {
+ bg_cv_.wait(l);
+ }
+ }
+
+ // Adapter with the void(void*) signature passed to Env::Schedule(); "arg"
+ // must point to a SimulatedBackgroundTask.
+ static void DoSimulatedTask(void* arg) {
+ reinterpret_cast<SimulatedBackgroundTask*>(arg)->Run();
+ }
+
+ private:
+ const void* db_key_;
+ const std::string db_name_;
+ const void* cf_key_;
+ const std::string cf_name_;
+ const ThreadStatus::OperationType operation_type_;
+ const ThreadStatus::StateType state_type_;
+ // mutex_ and bg_cv_ coordinate Run(), FinishAllTasks() and WaitUntilDone().
+ std::mutex mutex_;
+ std::condition_variable bg_cv_;
+ // Read and written only while holding mutex_.
+ bool should_run_;
+ // Number of threads currently inside Run(); atomic because
+ // WaitUntilScheduled() reads it without taking mutex_.
+ std::atomic<int> running_count_;
+};
+
+// Fixture for the thread-list tests; it carries no state of its own.
+class ThreadListTest : public testing::Test {
+ public:
+  ThreadListTest() {}
+};
+
+TEST_F(ThreadListTest, GlobalTables) {
+  // Each global lookup table must be indexed by its own enum value and carry
+  // the name reported by the matching ThreadStatus accessor.
+  for (int op = 0; op != ThreadStatus::NUM_OP_TYPES; ++op) {
+    const auto& entry = global_operation_table[op];
+    ASSERT_EQ(entry.type, op);
+    ASSERT_EQ(entry.name, ThreadStatus::GetOperationName(
+                              static_cast<ThreadStatus::OperationType>(op)));
+  }
+
+  for (int state = 0; state != ThreadStatus::NUM_STATE_TYPES; ++state) {
+    const auto& entry = global_state_table[state];
+    ASSERT_EQ(entry.type, state);
+    ASSERT_EQ(entry.name, ThreadStatus::GetStateName(
+                              static_cast<ThreadStatus::StateType>(state)));
+  }
+
+  for (int stage = 0; stage != ThreadStatus::NUM_OP_STAGES; ++stage) {
+    const auto& entry = global_op_stage_table[stage];
+    ASSERT_EQ(entry.stage, stage);
+    ASSERT_EQ(entry.name, ThreadStatus::GetOperationStageName(
+                              static_cast<ThreadStatus::OperationStage>(stage)));
+  }
+}
+
+// Schedules simulated tasks on the HIGH and LOW pools and checks that the
+// thread list attributes them to db "running" / column family "pikachu"
+// while they run, and to nothing once they have finished.
+TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) {
+  Env* env = Env::Default();
+  const int kHighPriorityThreads = 3;
+  const int kLowPriorityThreads = 5;
+  const int kSimulatedHighPriThreads = kHighPriorityThreads - 1;
+  const int kSimulatedLowPriThreads = kLowPriorityThreads / 3;
+  env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH);
+  env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW);
+
+  SimulatedBackgroundTask running_task(
+      reinterpret_cast<void*>(1234), "running",
+      reinterpret_cast<void*>(5678), "pikachu");
+
+  for (int i = 0; i < kSimulatedHighPriThreads; ++i) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task,
+                  Env::Priority::HIGH);
+  }
+  for (int i = 0; i < kSimulatedLowPriThreads; ++i) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task,
+                  Env::Priority::LOW);
+  }
+  running_task.WaitUntilScheduled(
+      kSimulatedHighPriThreads + kSimulatedLowPriThreads, env);
+
+  std::vector<ThreadStatus> thread_list;
+  int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0};
+
+  // Recounts, per thread type, the entries of thread_list that report the
+  // simulated database / column family.
+  auto recount = [&thread_list, &running_count]() {
+    for (int& count : running_count) {
+      count = 0;
+    }
+    for (auto thread_status : thread_list) {
+      if (thread_status.cf_name == "pikachu" &&
+          thread_status.db_name == "running") {
+        running_count[thread_status.thread_type]++;
+      }
+    }
+  };
+
+  // Verify the number of running threads in each pool.
+  env->GetThreadList(&thread_list);
+  recount();
+  ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY],
+            kSimulatedHighPriThreads);
+  ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY],
+            kSimulatedLowPriThreads);
+  ASSERT_EQ(running_count[ThreadStatus::USER], 0);
+
+  running_task.FinishAllTasks();
+  running_task.WaitUntilDone();
+
+  // Verify none of the threads are running
+  env->GetThreadList(&thread_list);
+  recount();
+  ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY], 0);
+  ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY], 0);
+  ASSERT_EQ(running_count[ThreadStatus::USER], 0);
+}
+
+namespace {
+// Adds one to the operation and state counters of every listed thread.
+void UpdateStatusCounts(const std::vector<ThreadStatus>& thread_list,
+                        int operation_counts[], int state_counts[]) {
+  for (const auto& status : thread_list) {
+    ++operation_counts[status.operation_type];
+    ++state_counts[status.state_type];
+  }
+}
+
+// Asserts that the first "size" collected counters match the expected ones
+// and zeroes each collected counter after it has been checked.
+void VerifyAndResetCounts(const int correct_counts[], int collected_counts[],
+                          int size) {
+  for (int idx = 0; idx < size; ++idx) {
+    ASSERT_EQ(collected_counts[idx], correct_counts[idx]);
+    collected_counts[idx] = 0;
+  }
+}
+
+// Moves "amount" from the from_event counter to the to_event counter.
+void UpdateCount(int operation_counts[], int from_event, int to_event,
+                 int amount) {
+  operation_counts[from_event] -= amount;
+  operation_counts[to_event] += amount;
+}
+}  // namespace
+
+// Runs four groups of simulated tasks (one flush group on the HIGH pool,
+// three compaction groups on the LOW pool) and, after finishing each group in
+// turn, checks that the per-operation counts derived from the thread list
+// move from the group's operation type to OP_UNKNOWN.
+TEST_F(ThreadListTest, SimpleEventTest) {
+ Env* env = Env::Default();
+
+ // simulated tasks
+ const int kFlushWriteTasks = 3;
+ SimulatedBackgroundTask flush_write_task(
+ reinterpret_cast<void*>(1234), "running",
+ reinterpret_cast<void*>(5678), "pikachu",
+ ThreadStatus::OP_FLUSH);
+
+ const int kCompactionWriteTasks = 4;
+ SimulatedBackgroundTask compaction_write_task(
+ reinterpret_cast<void*>(1234), "running",
+ reinterpret_cast<void*>(5678), "pikachu",
+ ThreadStatus::OP_COMPACTION);
+
+ const int kCompactionReadTasks = 5;
+ SimulatedBackgroundTask compaction_read_task(
+ reinterpret_cast<void*>(1234), "running",
+ reinterpret_cast<void*>(5678), "pikachu",
+ ThreadStatus::OP_COMPACTION);
+
+ const int kCompactionWaitTasks = 6;
+ SimulatedBackgroundTask compaction_wait_task(
+ reinterpret_cast<void*>(1234), "running",
+ reinterpret_cast<void*>(5678), "pikachu",
+ ThreadStatus::OP_COMPACTION);
+
+ // setup right answers
+ int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+ correct_operation_counts[ThreadStatus::OP_FLUSH] =
+ kFlushWriteTasks;
+ correct_operation_counts[ThreadStatus::OP_COMPACTION] =
+ kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks;
+
+ // Size each pool to exactly the number of tasks it will host, so every
+ // pool thread ends up running one simulated task.
+ env->SetBackgroundThreads(
+ correct_operation_counts[ThreadStatus::OP_FLUSH], Env::HIGH);
+ env->SetBackgroundThreads(
+ correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW);
+
+ // schedule the simulated tasks
+ for (int t = 0; t < kFlushWriteTasks; ++t) {
+ env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+ &flush_write_task, Env::Priority::HIGH);
+ }
+ flush_write_task.WaitUntilScheduled(kFlushWriteTasks, env);
+
+ for (int t = 0; t < kCompactionWriteTasks; ++t) {
+ env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+ &compaction_write_task, Env::Priority::LOW);
+ }
+ compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks, env);
+
+ for (int t = 0; t < kCompactionReadTasks; ++t) {
+ env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+ &compaction_read_task, Env::Priority::LOW);
+ }
+ compaction_read_task.WaitUntilScheduled(kCompactionReadTasks, env);
+
+ for (int t = 0; t < kCompactionWaitTasks; ++t) {
+ env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+ &compaction_wait_task, Env::Priority::LOW);
+ }
+ compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks, env);
+
+ // verify the thread-status
+ int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+ int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0};
+
+ std::vector<ThreadStatus> thread_list;
+ env->GetThreadList(&thread_list);
+ UpdateStatusCounts(thread_list, operation_counts, state_counts);
+ VerifyAndResetCounts(correct_operation_counts, operation_counts,
+ ThreadStatus::NUM_OP_TYPES);
+
+ // terminate compaction-wait tasks and see if the thread-status
+ // reflects this update
+ compaction_wait_task.FinishAllTasks();
+ compaction_wait_task.WaitUntilDone();
+ UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+ ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks);
+
+ env->GetThreadList(&thread_list);
+ UpdateStatusCounts(thread_list, operation_counts, state_counts);
+ VerifyAndResetCounts(correct_operation_counts, operation_counts,
+ ThreadStatus::NUM_OP_TYPES);
+
+ // terminate flush-write tasks and see if the thread-status
+ // reflects this update
+ flush_write_task.FinishAllTasks();
+ flush_write_task.WaitUntilDone();
+ UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH,
+ ThreadStatus::OP_UNKNOWN, kFlushWriteTasks);
+
+ env->GetThreadList(&thread_list);
+ UpdateStatusCounts(thread_list, operation_counts, state_counts);
+ VerifyAndResetCounts(correct_operation_counts, operation_counts,
+ ThreadStatus::NUM_OP_TYPES);
+
+ // terminate compaction-write tasks and see if the thread-status
+ // reflects this update
+ compaction_write_task.FinishAllTasks();
+ compaction_write_task.WaitUntilDone();
+ UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+ ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks);
+
+ env->GetThreadList(&thread_list);
+ UpdateStatusCounts(thread_list, operation_counts, state_counts);
+ VerifyAndResetCounts(correct_operation_counts, operation_counts,
+ ThreadStatus::NUM_OP_TYPES);
+
+ // terminate compaction-read tasks and see if the thread-status
+ // reflects this update
+ compaction_read_task.FinishAllTasks();
+ compaction_read_task.WaitUntilDone();
+ UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+ ThreadStatus::OP_UNKNOWN, kCompactionReadTasks);
+
+ env->GetThreadList(&thread_list);
+ UpdateStatusCounts(thread_list, operation_counts, state_counts);
+ VerifyAndResetCounts(correct_operation_counts, operation_counts,
+ ThreadStatus::NUM_OP_TYPES);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test entry point: hands the command line to googletest and runs all tests.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
+
+#else
+
+// Fallback entry point used when ROCKSDB_USING_THREAD_STATUS is not defined:
+// no tests are compiled in, so it initializes googletest and reports success
+// without running anything.
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return 0;
+}
+
+#endif // ROCKSDB_USING_THREAD_STATUS
diff --git a/src/rocksdb/util/thread_local.cc b/src/rocksdb/util/thread_local.cc
new file mode 100644
index 000000000..9fa16f609
--- /dev/null
+++ b/src/rocksdb/util/thread_local.cc
@@ -0,0 +1,554 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/thread_local.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
+#include <stdlib.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+// One thread-local slot: a raw pointer that is accessed atomically.
+struct Entry {
+  std::atomic<void*> ptr;
+
+  Entry() : ptr(nullptr) {}
+  // std::atomic is not copyable, so a copy takes a relaxed snapshot of the
+  // source slot instead.
+  Entry(const Entry& other)
+      : ptr(other.ptr.load(std::memory_order_relaxed)) {}
+};
+
+class StaticMeta;
+
+// This is the structure that is declared as "thread_local" storage.
+// The vector keeps a list of atomic pointers for all instances for the
+// "current" thread. The vector is indexed by an Id that is unique in the
+// process and associated with one ThreadLocalPtr instance. The Id is assigned
+// by a global StaticMeta singleton. So if we instantiated 3 ThreadLocalPtr
+// instances, each thread will have a ThreadData with a vector of size 3:
+// ---------------------------------------------------
+// | | instance 1 | instance 2 | instance 3 |
+// ---------------------------------------------------
+// | thread 1 | void* | void* | void* | <- ThreadData
+// ---------------------------------------------------
+// | thread 2 | void* | void* | void* | <- ThreadData
+// ---------------------------------------------------
+// | thread 3 | void* | void* | void* | <- ThreadData
+// ---------------------------------------------------
+// Per-thread record: the thread's slot vector plus the links that chain it
+// into the list owned by the StaticMeta it was created for.
+struct ThreadData {
+  explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst) : inst(_inst) {}
+
+  std::vector<Entry> entries;
+  ThreadData* next = nullptr;
+  ThreadData* prev = nullptr;
+  ThreadLocalPtr::StaticMeta* inst;
+};
+
+// Process-wide bookkeeping behind every ThreadLocalPtr: hands out instance
+// ids, chains together the ThreadData of all threads, and stores the
+// UnrefHandler registered for each id.
+class ThreadLocalPtr::StaticMeta {
+public:
+ StaticMeta();
+
+ // Return the next available Id
+ uint32_t GetId();
+ // Return the next available Id without claiming it
+ uint32_t PeekId() const;
+ // Return the given Id back to the free pool. This also triggers
+ // UnrefHandler for associated pointer value (if not NULL) for all threads.
+ void ReclaimId(uint32_t id);
+
+ // Return the pointer value for the given id for the current thread.
+ void* Get(uint32_t id) const;
+ // Reset the pointer value for the given id for the current thread.
+ void Reset(uint32_t id, void* ptr);
+ // Atomically swap the supplied ptr and return the previous value
+ void* Swap(uint32_t id, void* ptr);
+ // Atomically compare and swap the provided value only if it equals
+ // to expected value.
+ bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
+ // Reset all thread local data to replacement, and return non-nullptr
+ // data for all existing threads
+ void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
+ // Update res by applying func on each thread-local value. Holds a lock that
+ // prevents unref handler from running during this call, but clients must
+ // still provide external synchronization since the owning thread can
+ // access the values without internal locking, e.g., via Get() and Reset().
+ void Fold(uint32_t id, FoldFunc func, void* res);
+
+ // Register the UnrefHandler for id
+ void SetHandler(uint32_t id, UnrefHandler handler);
+
+ // protect inst, next_instance_id_, free_instance_ids_, head_,
+ // ThreadData.entries
+ //
+ // Note that here we prefer function static variable instead of the usual
+ // global static variable. The reason is that c++ destruction order of
+ // static variables in the reverse order of their construction order.
+ // However, C++ does not guarantee any construction order when global
+ // static variables are defined in different files, while the function
+ // static variables are initialized when their function are first called.
+ // As a result, the construction order of the function static variables
+ // can be controlled by properly invoke their first function calls in
+ // the right order.
+ //
+ // For instance, the following function contains a function static
+ // variable. We place a dummy function call of this inside
+ // Env::Default() to ensure the construction order.
+ static port::Mutex* Mutex();
+
+ // Returns the member mutex of the current StaticMeta. In general,
+ // Mutex() should be used instead of this one. However, in case where
+ // the static variable inside Instance() goes out of scope, MemberMutex()
+ // should be used. One example is OnThreadExit() function.
+ port::Mutex* MemberMutex() { return &mutex_; }
+
+private:
+ // Get UnrefHandler for id without acquiring the mutex; the caller must
+ // already hold it.
+ // REQUIRES: mutex locked
+ UnrefHandler GetHandler(uint32_t id);
+
+ // Triggered before a thread terminates
+ static void OnThreadExit(void* ptr);
+
+ // Add current thread's ThreadData to the global chain
+ // REQUIRES: mutex locked
+ void AddThreadData(ThreadData* d);
+
+ // Remove current thread's ThreadData from the global chain
+ // REQUIRES: mutex locked
+ void RemoveThreadData(ThreadData* d);
+
+ // Returns the calling thread's ThreadData, creating and registering it on
+ // first use.
+ static ThreadData* GetThreadLocal();
+
+ uint32_t next_instance_id_;
+ // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
+ // frequently. This also prevents it from blowing up the vector space.
+ autovector<uint32_t> free_instance_ids_;
+ // Chain all thread local structure together. This is necessary since
+ // when one ThreadLocalPtr gets destroyed, we need to loop over each
+ // thread's version of pointer corresponding to that instance and
+ // call UnrefHandler for it.
+ ThreadData head_;
+
+ // Maps an instance id to the UnrefHandler registered for it.
+ std::unordered_map<uint32_t, UnrefHandler> handler_map_;
+
+ // The private mutex. Developers should always use Mutex() instead of
+ // using this variable directly.
+ port::Mutex mutex_;
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+ // Thread local storage
+ static __thread ThreadData* tls_;
+#endif
+
+ // Used to make thread exit trigger possible if !defined(OS_MACOSX).
+ // Otherwise, used to retrieve thread data.
+ pthread_key_t pthread_key_;
+};
+
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+// Definition of the per-thread ThreadData pointer; it stays null until
+// GetThreadLocal() creates the thread's ThreadData.
+__thread ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
+#endif
+
+// Windows doesn't support a per-thread destructor with its
+// TLS primitives. So, we build it manually by inserting a
+// function to be called on each thread's exit.
+// See http://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
+// and http://www.nynaeve.net/?p=183
+//
+// really we do this to have clear conscience since using TLS with thread-pools
+// is iffy
+// although OK within a request. But otherwise, threads have no identity in its
+// modern use.
+
+// This runs on windows only called from the System Loader
+#ifdef OS_WIN
+
+// The Windows cleanup routine is invoked from the System Loader with a
+// different signature, so we cannot directly hook up the original
+// OnThreadExit, which is a private member. Instead, the StaticMeta class
+// shares the address of that function with us so that we can invoke it.
+namespace wintlscleanup {
+
+// Both values are filled in by the StaticMeta singleton constructor: the
+// routine is set to OnThreadExit and the key to its pthread key.
+UnrefHandler thread_local_inclass_routine = nullptr;
+pthread_key_t thread_local_key = pthread_key_t (-1);
+
+// Static callback function to call with each thread termination.
+void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
+  // We decided to punt on PROCESS_EXIT
+  if (reason != DLL_THREAD_DETACH) {
+    return;
+  }
+  // Nothing to do until StaticMeta has published its key and routine.
+  if (thread_local_key == pthread_key_t(-1) ||
+      thread_local_inclass_routine == nullptr) {
+    return;
+  }
+  void* tls = TlsGetValue(thread_local_key);
+  if (tls != nullptr) {
+    thread_local_inclass_routine(tls);
+  }
+}
+
+}  // wintlscleanup
+
+// extern "C" suppresses C++ name mangling so we know the symbol name for the
+// linker /INCLUDE:symbol pragmas below.
+// Hooks wintlscleanup::WinOnThreadExit into thread termination: MSVC builds
+// place a callback pointer in the .CRT$XLB section, other Windows
+// toolchains fall back to a DllMain that forwards DLL_THREAD_DETACH.
+extern "C" {
+
+#ifdef _MSC_VER
+// The linker must not discard thread_callback_on_exit. (We force a reference
+// to this variable with a linker /include:symbol pragma to ensure that.) If
+// this variable is discarded, the OnThreadExit function will never be called.
+#ifndef _X86_
+
+// .CRT section is merged with .rdata on x64 so it must be constant data.
+#pragma const_seg(".CRT$XLB")
+// When defining a const variable, it must have external linkage to be sure the
+// linker doesn't discard it.
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit;
+const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit =
+ wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma const_seg()
+
+#pragma comment(linker, "/include:_tls_used")
+#pragma comment(linker, "/include:p_thread_callback_on_exit")
+
+#else // _X86_
+
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma data_seg()
+
+// The x86 build references the same two symbols with one extra leading
+// underscore.
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit")
+
+#endif // _X86_
+
+#else
+// https://github.com/couchbase/gperftools/blob/master/src/windows/port.cc
+BOOL WINAPI DllMain(HINSTANCE h, DWORD dwReason, PVOID pv) {
+ if (dwReason == DLL_THREAD_DETACH)
+ wintlscleanup::WinOnThreadExit(h, dwReason, pv);
+ return TRUE;
+}
+#endif
+} // extern "C"
+
+#endif // OS_WIN
+
+// Calls Instance() so that the StaticMeta singleton is constructed now.
+void ThreadLocalPtr::InitSingletons() {
+  ThreadLocalPtr::Instance();
+}
+
+// Returns the process-wide StaticMeta, creating it on the first call.
+ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
+  // A function-local static is used rather than a global static because it is
+  // initialized on the first call, which lets callers control construction
+  // order simply by ordering their first calls.
+  //
+  // The singleton is deliberately a heap object that is never deleted instead
+  // of a static object. Otherwise a child thread that uses ThreadLocalPtr and
+  // dies AFTER the main thread would run into a destruction-order disaster:
+  // its OnThreadExit cleans up the thread-local data and depends on the
+  // variable below, which would already have been destroyed when the main
+  // thread went away, causing an invalid access.
+  //
+  // Storing tls_ with thread_local instead of __thread would also solve this,
+  // because thread_local supports dynamic construction and destruction of
+  // non-primitive typed variables and therefore lets us guarantee the
+  // destruction order even when the main thread dies before its children.
+  // However, thread_local is not supported in all compilers that accept
+  // -std=c++11 (e.g. Mac with XCode < 8; XCode 8+ supports thread_local).
+  static ThreadLocalPtr::StaticMeta* meta = new ThreadLocalPtr::StaticMeta();
+  return meta;
+}
+
+// Returns the mutex owned by the StaticMeta singleton.
+port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() {
+  StaticMeta* const meta = Instance();
+  return &meta->mutex_;
+}
+
+// Thread-exit hook. "ptr" is the exiting thread's ThreadData: it is unlinked
+// from the global chain, every non-null slot is passed to the UnrefHandler
+// registered for its id (if any), and the ThreadData is deleted.
+void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
+ auto* tls = static_cast<ThreadData*>(ptr);
+ assert(tls != nullptr);
+
+ // Use the StaticMeta pointer cached in the ThreadData instead of calling
+ // StaticMeta::Instance(): the variable inside Instance() might already
+ // have gone out of scope here in case this OnThreadExit is called after
+ // the main thread dies.
+ auto* inst = tls->inst;
+ pthread_setspecific(inst->pthread_key_, nullptr);
+
+ // For the same reason the member mutex is locked directly rather than
+ // through Mutex().
+ MutexLock l(inst->MemberMutex());
+ inst->RemoveThreadData(tls);
+ // Unref stored pointers of current thread from all instances
+ uint32_t id = 0;
+ for (auto& e : tls->entries) {
+ void* raw = e.ptr.load();
+ if (raw != nullptr) {
+ auto unref = inst->GetHandler(id);
+ if (unref != nullptr) {
+ unref(raw);
+ }
+ }
+ ++id;
+ }
+ // Delete thread local structure no matter if it is Mac platform
+ delete tls;
+}
+
+// Creates the pthread key whose destructor is OnThreadExit (aborting if that
+// fails), initializes head_ as an empty circular list and, on Windows,
+// publishes the key and the exit routine to wintlscleanup.
+ThreadLocalPtr::StaticMeta::StaticMeta()
+ : next_instance_id_(0),
+ head_(this),
+ pthread_key_(0) {
+ if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
+ abort();
+ }
+
+ // OnThreadExit is not getting called on the main thread.
+ // Call through the static destructor mechanism to avoid memory leak.
+ //
+ // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global
+ // singleton (destructors are invoked in reverse order of constructor
+ // _completion_); the latter must not mutate internal members. This
+ // cleanup mechanism inherently relies on use-after-release of the
+ // StaticMeta, and is brittle with respect to compiler-specific handling
+ // of memory backing destructed statically-scoped objects. Perhaps
+ // registering with atexit(3) would be more robust.
+ //
+// This is not required on Windows.
+#if !defined(OS_WIN)
+ static struct A {
+ ~A() {
+#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL
+ // Without __thread support, shadow the member name with a local looked
+ // up through the pthread key so the code below is shared.
+ ThreadData* tls_ =
+ static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+ if (tls_) {
+ OnThreadExit(tls_);
+ }
+ }
+ } a;
+#endif // !defined(OS_WIN)
+
+ // An empty chain is a sentinel node linked to itself.
+ head_.next = &head_;
+ head_.prev = &head_;
+
+#ifdef OS_WIN
+ // Share with Windows its cleanup routine and the key
+ wintlscleanup::thread_local_inclass_routine = OnThreadExit;
+ wintlscleanup::thread_local_key = pthread_key_;
+#endif
+}
+
+// Appends "d" at the tail of the circular chain anchored at head_.
+// REQUIRES: mutex locked
+void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  ThreadData* const tail = head_.prev;
+  d->next = &head_;
+  d->prev = tail;
+  tail->next = d;
+  head_.prev = d;
+}
+
+// Unlinks "d" from the chain and leaves it linked to itself.
+// REQUIRES: mutex locked
+void ThreadLocalPtr::StaticMeta::RemoveThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  ThreadData* const after = d->next;
+  ThreadData* const before = d->prev;
+  after->prev = before;
+  before->next = after;
+  d->next = d;
+  d->prev = d;
+}
+
+// Returns the calling thread's ThreadData. On first use it is allocated,
+// linked into the global chain and stored under pthread_key_; the process
+// aborts if that store fails.
+ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
+#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL
+ // Make this local variable name look like a member variable so that we
+ // can share all the code below
+ ThreadData* tls_ =
+ static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+
+ if (UNLIKELY(tls_ == nullptr)) {
+ auto* inst = Instance();
+ tls_ = new ThreadData(inst);
+ {
+ // Register it in the global chain, needs to be done before thread exit
+ // handler registration
+ MutexLock l(Mutex());
+ inst->AddThreadData(tls_);
+ }
+ // Even it is not OS_MACOSX, need to register value for pthread_key_ so that
+ // its exit handler will be triggered.
+ if (pthread_setspecific(inst->pthread_key_, tls_) != 0) {
+ // Undo the registration and release the record before aborting.
+ {
+ MutexLock l(Mutex());
+ inst->RemoveThreadData(tls_);
+ }
+ delete tls_;
+ abort();
+ }
+ }
+ return tls_;
+}
+
+// Returns the calling thread's pointer for "id", or nullptr when the thread
+// has no slot for that id yet.
+void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
+  const auto& slots = GetThreadLocal()->entries;
+  if (UNLIKELY(id >= slots.size())) {
+    return nullptr;
+  }
+  return slots[id].ptr.load(std::memory_order_acquire);
+}
+
+// Stores "ptr" in the calling thread's slot for "id", growing the slot vector
+// first when needed.
+void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
+  auto& slots = GetThreadLocal()->entries;
+  if (UNLIKELY(id >= slots.size())) {
+    // Growing must happen under the mutex: ReclaimId accesses the entries.
+    MutexLock l(Mutex());
+    slots.resize(id + 1);
+  }
+  slots[id].ptr.store(ptr, std::memory_order_release);
+}
+
+// Atomically replaces the calling thread's value for "id" with "ptr" and
+// returns the previous value, growing the slot vector first when needed.
+void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
+  auto& slots = GetThreadLocal()->entries;
+  if (UNLIKELY(id >= slots.size())) {
+    // Growing must happen under the mutex: ReclaimId accesses the entries.
+    MutexLock l(Mutex());
+    slots.resize(id + 1);
+  }
+  return slots[id].ptr.exchange(ptr, std::memory_order_acquire);
+}
+
+// Stores "ptr" in the calling thread's slot for "id" only if the slot
+// currently holds "expected". On failure "expected" is updated with the
+// value that was found. Returns whether the store happened.
+bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
+                                                void*& expected) {
+  auto& slots = GetThreadLocal()->entries;
+  if (UNLIKELY(id >= slots.size())) {
+    // Growing must happen under the mutex: ReclaimId accesses the entries.
+    MutexLock l(Mutex());
+    slots.resize(id + 1);
+  }
+  std::atomic<void*>& slot = slots[id].ptr;
+  return slot.compare_exchange_strong(expected, ptr, std::memory_order_release,
+                                      std::memory_order_relaxed);
+}
+
+// Replaces every thread's value for "id" with "replacement" and appends the
+// previous non-null values to *ptrs.
+void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
+                                        void* const replacement) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id >= t->entries.size()) {
+      continue;  // this thread has no slot for id
+    }
+    void* const previous =
+        t->entries[id].ptr.exchange(replacement, std::memory_order_acquire);
+    if (previous != nullptr) {
+      ptrs->push_back(previous);
+    }
+  }
+}
+
+// Calls func(value, res) for every thread's non-null value of "id" while
+// holding the mutex.
+void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id >= t->entries.size()) {
+      continue;  // this thread has no slot for id
+    }
+    void* const current = t->entries[id].ptr.load();
+    if (current != nullptr) {
+      func(current, res);
+    }
+  }
+}
+
+// Forwards to StaticMeta::PeekId() on the singleton.
+uint32_t ThreadLocalPtr::TEST_PeekId() {
+  StaticMeta* const meta = Instance();
+  return meta->PeekId();
+}
+
+// Records "handler" as the UnrefHandler for "id", replacing any previous one.
+void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id,
+                                            UnrefHandler handler) {
+  MutexLock l(Mutex());
+  UnrefHandler& registered = handler_map_[id];
+  registered = handler;
+}
+
+// Returns the UnrefHandler recorded for "id", or nullptr if there is none.
+// REQUIRES: mutex locked
+UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) {
+  Mutex()->AssertHeld();
+  const auto found = handler_map_.find(id);
+  return found != handler_map_.end() ? found->second : nullptr;
+}
+
+// Claims an instance id: the most recently reclaimed one if any is free,
+// otherwise a fresh id.
+uint32_t ThreadLocalPtr::StaticMeta::GetId() {
+  MutexLock l(Mutex());
+  if (!free_instance_ids_.empty()) {
+    const uint32_t recycled = free_instance_ids_.back();
+    free_instance_ids_.pop_back();
+    return recycled;
+  }
+  return next_instance_id_++;
+}
+
+// Returns the id that the next GetId() call would hand out, without
+// claiming it.
+uint32_t ThreadLocalPtr::StaticMeta::PeekId() const {
+  MutexLock l(Mutex());
+  if (free_instance_ids_.empty()) {
+    return next_instance_id_;
+  }
+  return free_instance_ids_.back();
+}
+
+// Returns "id" to the free pool. Every thread's value for the id is cleared,
+// non-null values are passed to the id's UnrefHandler (if one is registered),
+// and the handler registration is reset.
+void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) {
+  MutexLock l(Mutex());
+  const UnrefHandler unref = GetHandler(id);
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id >= t->entries.size()) {
+      continue;  // this thread has no slot for id
+    }
+    void* const stale = t->entries[id].ptr.exchange(nullptr);
+    if (stale != nullptr && unref != nullptr) {
+      unref(stale);
+    }
+  }
+  handler_map_[id] = nullptr;
+  free_instance_ids_.push_back(id);
+}
+
+// Claims an instance id and, when "handler" is non-null, registers it as the
+// UnrefHandler for that id.
+ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler)
+    : id_(Instance()->GetId()) {
+  if (handler == nullptr) {
+    return;
+  }
+  Instance()->SetHandler(id_, handler);
+}
+
+// Gives the instance id back to the singleton via ReclaimId().
+ThreadLocalPtr::~ThreadLocalPtr() {
+  StaticMeta* const meta = Instance();
+  meta->ReclaimId(id_);
+}
+
+// Returns the calling thread's value for this instance.
+void* ThreadLocalPtr::Get() const {
+  StaticMeta* const meta = Instance();
+  return meta->Get(id_);
+}
+
+// Sets the calling thread's value for this instance to "ptr".
+void ThreadLocalPtr::Reset(void* ptr) {
+  StaticMeta* const meta = Instance();
+  meta->Reset(id_, ptr);
+}
+
+// Stores "ptr" for the calling thread and returns the value it replaced.
+void* ThreadLocalPtr::Swap(void* ptr) {
+  StaticMeta* const meta = Instance();
+  return meta->Swap(id_, ptr);
+}
+
+// Forwards to StaticMeta::CompareAndSwap() with this instance's id.
+bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) {
+  StaticMeta* const meta = Instance();
+  return meta->CompareAndSwap(id_, ptr, expected);
+}
+
+// Forwards to StaticMeta::Scrape() with this instance's id.
+void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) {
+  StaticMeta* const meta = Instance();
+  meta->Scrape(id_, ptrs, replacement);
+}
+
+// Forwards to StaticMeta::Fold() with this instance's id.
+void ThreadLocalPtr::Fold(FoldFunc func, void* res) {
+  StaticMeta* const meta = Instance();
+  meta->Fold(id_, func, res);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/thread_local.h b/src/rocksdb/util/thread_local.h
new file mode 100644
index 000000000..c4b762ab6
--- /dev/null
+++ b/src/rocksdb/util/thread_local.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "util/autovector.h"
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Cleanup function that will be called for a stored thread local
+// pointer (if not NULL) when one of the following happens:
+// (1) a thread terminates
+// (2) a ThreadLocalPtr is destroyed
+//
+// Warning: this function is called while holding a global mutex. The same mutex
+// is used (at least in some cases) by most methods of ThreadLocalPtr, and it's
+// shared across all instances of ThreadLocalPtr. Therefore, extra care
+// is needed to avoid deadlocks. In particular, the handler shouldn't lock any
+// mutexes and shouldn't call any methods of any ThreadLocalPtr instances,
+// unless you know what you're doing.
+typedef void (*UnrefHandler)(void* ptr);
+
+// ThreadLocalPtr stores only values of pointer type. Different from
+// the usual thread-local-storage, ThreadLocalPtr has the ability to
+// distinguish data coming from different threads and different
+// ThreadLocalPtr instances. For example, if a regular thread_local
+// variable A is declared in DBImpl, two DBImpl objects would share
+// the same A. However, a ThreadLocalPtr that is defined under the
+// scope of DBImpl can avoid such conflicts. As a result, its memory
+// usage would be O(# of threads * # of ThreadLocalPtr instances).
+class ThreadLocalPtr {
+ public:
+ // Creates a new instance. `handler`, if non-null, is the cleanup function
+ // for pointers stored through this instance.
+ explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
+
+ // Not copyable.
+ ThreadLocalPtr(const ThreadLocalPtr&) = delete;
+ ThreadLocalPtr& operator=(const ThreadLocalPtr&) = delete;
+
+ ~ThreadLocalPtr();
+
+ // Return the current pointer stored in thread local
+ void* Get() const;
+
+ // Set a new pointer value to the thread local storage.
+ void Reset(void* ptr);
+
+ // Atomically swap the supplied ptr and return the previous value
+ void* Swap(void* ptr);
+
+ // Atomically compare the stored value with expected. Set the new
+ // pointer value to thread local only if the comparison is true.
+ // Otherwise, expected returns the stored value.
+ // Return true on success, false on failure
+ bool CompareAndSwap(void* ptr, void*& expected);
+
+ // Reset all thread local data to replacement, and return non-nullptr
+ // data for all existing threads
+ void Scrape(autovector<void*>* ptrs, void* const replacement);
+
+ // Callback type for Fold(): invoked as func(stored_value, res).
+ typedef std::function<void(void*, void*)> FoldFunc;
+ // Update res by applying func on each thread-local value. Holds a lock that
+ // prevents unref handler from running during this call, but clients must
+ // still provide external synchronization since the owning thread can
+ // access the values without internal locking, e.g., via Get() and Reset().
+ void Fold(FoldFunc func, void* res);
+
+ // Add here for testing
+ // Return the next available Id without claiming it
+ static uint32_t TEST_PeekId();
+
+ // Initialize the static singletons of the ThreadLocalPtr.
+ //
+ // If this function is not called, then the singletons will be
+ // automatically initialized when they are used.
+ //
+ // Calling this function twice or after the singletons have been
+ // initialized will be no-op.
+ static void InitSingletons();
+
+ // Implementation type; only declared here, defined outside this header.
+ class StaticMeta;
+
+private:
+
+ // Returns the StaticMeta object that the member functions delegate to.
+ static StaticMeta* Instance();
+
+ // Identifier of this instance; fixed for the lifetime of the object.
+ const uint32_t id_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/thread_local_test.cc b/src/rocksdb/util/thread_local_test.cc
new file mode 100644
index 000000000..b788b1773
--- /dev/null
+++ b/src/rocksdb/util/thread_local_test.cc
@@ -0,0 +1,580 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <thread>
+#include <atomic>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/autovector.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test fixture. Holds the default Env, which the tests below use to start
+// threads and wait for them to finish.
+class ThreadLocalTest : public testing::Test {
+ public:
+ ThreadLocalTest() : env_(Env::Default()) {}
+
+ Env* env_;
+};
+
+namespace {
+
+// State shared between a test body and the threads it starts. A pointer to a
+// Params object is passed to each thread function as its void* argument.
+struct Params {
+ Params(port::Mutex* m, port::CondVar* c, int* u, int n,
+ UnrefHandler handler = nullptr)
+ : mu(m),
+ cv(c),
+ unref(u),
+ total(n),
+ started(0),
+ completed(0),
+ doWrite(false),
+ tls1(handler),
+ tls2(nullptr) {}
+
+ port::Mutex* mu; // guards started/completed (and *unref in the handlers)
+ port::CondVar* cv; // signalled whenever started/completed changes
+ int* unref; // counter bumped by the tests' unref handlers; may be null
+ int total; // number of threads the test starts with this Params
+ int started; // threads that have entered the thread function
+ int completed; // threads that have finished their work
+ bool doWrite; // read by ConcurrentReadWriteTest to pick writer behavior
+ ThreadLocalPtr tls1; // owned; constructed with `handler`
+ ThreadLocalPtr* tls2; // not owned by Params; set by the test when needed
+};
+
+// Thin wrapper that exposes ThreadLocalPtr::TEST_PeekId() to the tests as
+// IDChecker::PeekId().
+class IDChecker : public ThreadLocalPtr {
+public:
+ static uint32_t PeekId() {
+ return TEST_PeekId();
+ }
+};
+
+} // anonymous namespace
+
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+// Checks id allocation: each new ThreadLocalPtr (one per Params, via tls1)
+// takes the id reported by PeekId(), and ids released by destroyed instances
+// are handed out again, most recently released first.
+TEST_F(ThreadLocalTest, UniqueIdTest) {
+ port::Mutex mu;
+ port::CondVar cv(&mu);
+
+ // All expectations are relative to the id that is next at test start.
+ uint32_t base_id = IDChecker::PeekId();
+ // New ThreadLocal instance bumps id by 1
+ {
+ // Id used 0
+ Params p1(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+ // Id used 1
+ Params p2(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 2u);
+ // Id used 2
+ Params p3(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+ // Id used 3
+ Params p4(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 4u);
+ }
+ // id 3, 2, 1, 0 are in the free queue in order
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 0u);
+
+ // pick up 0
+ Params p1(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+ // pick up 1
+ Params* p2 = new Params(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 2u);
+ // pick up 2
+ Params p3(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+ // return id 1
+ delete p2;
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+ // Now we have 3, 1 in queue
+ // pick up 1
+ Params p4(&mu, &cv, nullptr, 1u);
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+ // pick up 3
+ Params p5(&mu, &cv, nullptr, 1u);
+ // next new id
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 4u);
+ // After exit, id sequence in queue:
+ // 3, 1, 2, 0
+}
+#endif // __clang_analyzer__
+
+// Starts 1024 threads one after another. Each thread must find both
+// ThreadLocalPtr instances empty, i.e. it must not observe values written by
+// earlier threads, and must read back exactly what it writes.
+TEST_F(ThreadLocalTest, SequentialReadWriteTest) {
+ // global id list carries over 3, 1, 2, 0
+ uint32_t base_id = IDChecker::PeekId();
+
+ port::Mutex mu;
+ port::CondVar cv(&mu);
+ Params p(&mu, &cv, nullptr, 1);
+ ThreadLocalPtr tls2;
+ p.tls2 = &tls2;
+
+ // Thread body: exercise Get()/Reset() on both instances, then report
+ // completion through p.completed.
+ auto func = [](void* ptr) {
+ auto& params = *static_cast<Params*>(ptr);
+
+ ASSERT_TRUE(params.tls1.Get() == nullptr);
+ params.tls1.Reset(reinterpret_cast<int*>(1));
+ ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(1));
+ params.tls1.Reset(reinterpret_cast<int*>(2));
+ ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(2));
+
+ ASSERT_TRUE(params.tls2->Get() == nullptr);
+ params.tls2->Reset(reinterpret_cast<int*>(1));
+ ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(1));
+ params.tls2->Reset(reinterpret_cast<int*>(2));
+ ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(2));
+
+ params.mu->Lock();
+ ++(params.completed);
+ params.cv->SignalAll();
+ params.mu->Unlock();
+ };
+
+ for (int iter = 0; iter < 1024; ++iter) {
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+ // Another new thread, read/write should not see value from previous thread
+ env_->StartThread(func, static_cast<void*>(&p));
+ // Wait until the thread started in this iteration has reported completion.
+ mu.Lock();
+ while (p.completed != iter + 1) {
+ cv.Wait();
+ }
+ mu.Unlock();
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+ }
+}
+
+// Runs 16 reader threads (p1) and 16 writer threads (p2) at the same time.
+// Every thread stores its own distinct values and, for about one second,
+// checks that it only ever reads back those values.
+TEST_F(ThreadLocalTest, ConcurrentReadWriteTest) {
+ // global id list carries over 3, 1, 2, 0
+ uint32_t base_id = IDChecker::PeekId();
+
+ ThreadLocalPtr tls2;
+ port::Mutex mu1;
+ port::CondVar cv1(&mu1);
+ Params p1(&mu1, &cv1, nullptr, 16);
+ p1.tls2 = &tls2;
+
+ port::Mutex mu2;
+ port::CondVar cv2(&mu2);
+ Params p2(&mu2, &cv2, nullptr, 16);
+ p2.doWrite = true;
+ p2.tls2 = &tls2;
+
+ auto func = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+
+ // Barrier: take a per-thread sequence number and wait until all
+ // p.total threads of this group have started.
+ p.mu->Lock();
+ // Use size_t so the value's width follows the width of the pointer
+ // type it is cast to below.
+ size_t own = ++(p.started);
+ p.cv->SignalAll();
+ while (p.started != p.total) {
+ p.cv->Wait();
+ }
+ p.mu->Unlock();
+
+ // Let write threads write a different value from the read threads
+ if (p.doWrite) {
+ own += 8192;
+ }
+
+ ASSERT_TRUE(p.tls1.Get() == nullptr);
+ ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+ auto* env = Env::Default();
+ auto start = env->NowMicros();
+
+ p.tls1.Reset(reinterpret_cast<size_t*>(own));
+ p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+ // Loop for 1 second
+ while (env->NowMicros() - start < 1000 * 1000) {
+ for (int iter = 0; iter < 100000; ++iter) {
+ ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<size_t*>(own));
+ ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<size_t*>(own + 1));
+ if (p.doWrite) {
+ p.tls1.Reset(reinterpret_cast<size_t*>(own));
+ p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+ }
+ }
+ }
+
+ p.mu->Lock();
+ ++(p.completed);
+ p.cv->SignalAll();
+ p.mu->Unlock();
+ };
+
+ // Initiate 2 instances: one keeps writing and one keeps reading.
+ // The read instance should not see data from the write instance.
+ // Each thread-local copy of the value is also different from each
+ // other.
+ for (int th = 0; th < p1.total; ++th) {
+ env_->StartThread(func, static_cast<void*>(&p1));
+ }
+ for (int th = 0; th < p2.total; ++th) {
+ env_->StartThread(func, static_cast<void*>(&p2));
+ }
+
+ // Wait for every reader, then every writer, to report completion.
+ mu1.Lock();
+ while (p1.completed != p1.total) {
+ cv1.Wait();
+ }
+ mu1.Unlock();
+
+ mu2.Lock();
+ while (p2.completed != p2.total) {
+ cv2.Wait();
+ }
+ mu2.Unlock();
+
+ // tls2, p1.tls1 and p2.tls1 are still alive at this point.
+ ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+}
+
+// Counts unref-handler invocations in three situations: values never set
+// (case 0), threads exiting with values set (case 1), and a ThreadLocalPtr
+// being destroyed while threads still hold values (case 2). Each case runs
+// with 1, 2, 4, ..., 128 threads (`th += th` doubles the count).
+TEST_F(ThreadLocalTest, Unref) {
+ // Handler under test: the stored pointer is the Params object itself, so
+ // the handler can reach the shared counter through it.
+ auto unref = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+ p.mu->Lock();
+ ++(*p.unref);
+ p.mu->Unlock();
+ };
+
+ // Case 0: no unref triggered if ThreadLocalPtr is never accessed
+ auto func0 = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+
+ p.mu->Lock();
+ ++(p.started);
+ p.cv->SignalAll();
+ while (p.started != p.total) {
+ p.cv->Wait();
+ }
+ p.mu->Unlock();
+ };
+
+ for (int th = 1; th <= 128; th += th) {
+ port::Mutex mu;
+ port::CondVar cv(&mu);
+ int unref_count = 0;
+ Params p(&mu, &cv, &unref_count, th, unref);
+
+ for (int i = 0; i < p.total; ++i) {
+ env_->StartThread(func0, static_cast<void*>(&p));
+ }
+ env_->WaitForJoin();
+ ASSERT_EQ(unref_count, 0);
+ }
+
+ // Case 1: unref triggered by thread exit
+ auto func1 = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+
+ p.mu->Lock();
+ ++(p.started);
+ p.cv->SignalAll();
+ while (p.started != p.total) {
+ p.cv->Wait();
+ }
+ p.mu->Unlock();
+
+ ASSERT_TRUE(p.tls1.Get() == nullptr);
+ ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+ p.tls1.Reset(ptr);
+ p.tls2->Reset(ptr);
+
+ // Storing the same pointer a second time; the expected counts below
+ // allow only one unref per thread per instance.
+ p.tls1.Reset(ptr);
+ p.tls2->Reset(ptr);
+ };
+
+ for (int th = 1; th <= 128; th += th) {
+ port::Mutex mu;
+ port::CondVar cv(&mu);
+ int unref_count = 0;
+ ThreadLocalPtr tls2(unref);
+ Params p(&mu, &cv, &unref_count, th, unref);
+ p.tls2 = &tls2;
+
+ for (int i = 0; i < p.total; ++i) {
+ env_->StartThread(func1, static_cast<void*>(&p));
+ }
+
+ env_->WaitForJoin();
+
+ // N threads x 2 ThreadLocal instance cleanup on thread exit
+ ASSERT_EQ(unref_count, 2 * p.total);
+ }
+
+ // Case 2: unref triggered by ThreadLocal instance destruction
+ auto func2 = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+
+ p.mu->Lock();
+ ++(p.started);
+ p.cv->SignalAll();
+ while (p.started != p.total) {
+ p.cv->Wait();
+ }
+ p.mu->Unlock();
+
+ ASSERT_TRUE(p.tls1.Get() == nullptr);
+ ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+ p.tls1.Reset(ptr);
+ p.tls2->Reset(ptr);
+
+ p.tls1.Reset(ptr);
+ p.tls2->Reset(ptr);
+
+ p.mu->Lock();
+ ++(p.completed);
+ p.cv->SignalAll();
+
+ // Waiting for instruction to exit thread
+ // (the test body resets p.completed to 0 to release the threads)
+ while (p.completed != 0) {
+ p.cv->Wait();
+ }
+ p.mu->Unlock();
+ };
+
+ for (int th = 1; th <= 128; th += th) {
+ port::Mutex mu;
+ port::CondVar cv(&mu);
+ int unref_count = 0;
+ Params p(&mu, &cv, &unref_count, th, unref);
+ p.tls2 = new ThreadLocalPtr(unref);
+
+ for (int i = 0; i < p.total; ++i) {
+ env_->StartThread(func2, static_cast<void*>(&p));
+ }
+
+ // Wait for all threads to finish using Params
+ mu.Lock();
+ while (p.completed != p.total) {
+ cv.Wait();
+ }
+ mu.Unlock();
+
+ // Now destroy one ThreadLocal instance
+ delete p.tls2;
+ p.tls2 = nullptr;
+ // instance destroy for N threads
+ ASSERT_EQ(unref_count, p.total);
+
+ // Signal to exit
+ mu.Lock();
+ p.completed = 0;
+ cv.SignalAll();
+ mu.Unlock();
+ env_->WaitForJoin();
+ // additional N threads exit unref for the left instance
+ ASSERT_EQ(unref_count, 2 * p.total);
+ }
+}
+
+// Single-threaded check that Swap() installs the new value and returns the
+// value that was stored before the call.
+TEST_F(ThreadLocalTest, Swap) {
+  ThreadLocalPtr tls;
+  tls.Reset(reinterpret_cast<void*>(1));
+
+  // 1 -> nullptr
+  void* previous = tls.Swap(nullptr);
+  ASSERT_EQ(reinterpret_cast<int64_t>(previous), 1);
+
+  // nullptr -> 2
+  previous = tls.Swap(reinterpret_cast<void*>(2));
+  ASSERT_TRUE(previous == nullptr);
+  ASSERT_EQ(reinterpret_cast<int64_t>(tls.Get()), 2);
+
+  // 2 -> 3
+  previous = tls.Swap(reinterpret_cast<void*>(3));
+  ASSERT_EQ(reinterpret_cast<int64_t>(previous), 2);
+}
+
+// Verifies that once Scrape() has replaced every thread's value with nullptr,
+// neither ThreadLocalPtr destruction nor thread exit invokes the unref
+// handler. Runs with 1, 2, 4, ..., 128 threads.
+TEST_F(ThreadLocalTest, Scrape) {
+ auto unref = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+ p.mu->Lock();
+ ++(*p.unref);
+ p.mu->Unlock();
+ };
+
+ // Thread body: store the Params pointer in both instances, report
+ // completion, then block until the test body releases the thread.
+ auto func = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+
+ ASSERT_TRUE(p.tls1.Get() == nullptr);
+ ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+ p.tls1.Reset(ptr);
+ p.tls2->Reset(ptr);
+
+ p.tls1.Reset(ptr);
+ p.tls2->Reset(ptr);
+
+ p.mu->Lock();
+ ++(p.completed);
+ p.cv->SignalAll();
+
+ // Waiting for instruction to exit thread
+ while (p.completed != 0) {
+ p.cv->Wait();
+ }
+ p.mu->Unlock();
+ };
+
+ for (int th = 1; th <= 128; th += th) {
+ port::Mutex mu;
+ port::CondVar cv(&mu);
+ int unref_count = 0;
+ Params p(&mu, &cv, &unref_count, th, unref);
+ p.tls2 = new ThreadLocalPtr(unref);
+
+ for (int i = 0; i < p.total; ++i) {
+ env_->StartThread(func, static_cast<void*>(&p));
+ }
+
+ // Wait for all threads to finish using Params
+ mu.Lock();
+ while (p.completed != p.total) {
+ cv.Wait();
+ }
+ mu.Unlock();
+
+ ASSERT_EQ(unref_count, 0);
+
+ // Scrape all thread local data. No unref at thread
+ // exit or ThreadLocalPtr destruction
+ autovector<void*> ptrs;
+ p.tls1.Scrape(&ptrs, nullptr);
+ p.tls2->Scrape(&ptrs, nullptr);
+ delete p.tls2;
+ // Signal to exit
+ mu.Lock();
+ p.completed = 0;
+ cv.SignalAll();
+ mu.Unlock();
+ env_->WaitForJoin();
+
+ ASSERT_EQ(unref_count, 0);
+ }
+}
+
+// Verifies Fold(): 16 threads each store a heap-allocated atomic counter and
+// increment it 10 times; folding with a summing function must then yield
+// 16 * 10 while all threads are still alive.
+TEST_F(ThreadLocalTest, Fold) {
+ // Cleanup handler: the stored values are allocated with `new` in func.
+ auto unref = [](void* ptr) {
+ delete static_cast<std::atomic<int64_t>*>(ptr);
+ };
+ static const int kNumThreads = 16;
+ static const int kItersPerThread = 10;
+ port::Mutex mu;
+ port::CondVar cv(&mu);
+ Params params(&mu, &cv, nullptr, kNumThreads, unref);
+ auto func = [](void* ptr) {
+ auto& p = *static_cast<Params*>(ptr);
+ ASSERT_TRUE(p.tls1.Get() == nullptr);
+ p.tls1.Reset(new std::atomic<int64_t>(0));
+
+ for (int i = 0; i < kItersPerThread; ++i) {
+ static_cast<std::atomic<int64_t>*>(p.tls1.Get())->fetch_add(1);
+ }
+
+ p.mu->Lock();
+ ++(p.completed);
+ p.cv->SignalAll();
+
+ // Waiting for instruction to exit thread
+ while (p.completed != 0) {
+ p.cv->Wait();
+ }
+ p.mu->Unlock();
+ };
+
+ for (int th = 0; th < params.total; ++th) {
+ env_->StartThread(func, static_cast<void*>(&params));
+ }
+
+ // Wait for all threads to finish using Params
+ mu.Lock();
+ while (params.completed != params.total) {
+ cv.Wait();
+ }
+ mu.Unlock();
+
+ // Verify Fold() behavior
+ // `res` is the address of `sum`; the callback adds each thread's counter.
+ int64_t sum = 0;
+ params.tls1.Fold(
+ [](void* ptr, void* res) {
+ auto sum_ptr = static_cast<int64_t*>(res);
+ *sum_ptr += static_cast<std::atomic<int64_t>*>(ptr)->load();
+ },
+ &sum);
+ ASSERT_EQ(sum, kNumThreads * kItersPerThread);
+
+ // Signal to exit
+ mu.Lock();
+ params.completed = 0;
+ cv.SignalAll();
+ mu.Unlock();
+ env_->WaitForJoin();
+}
+
+// Single-threaded check of CompareAndSwap(): it succeeds when `expected`
+// matches the stored value, and on a mismatch it fails and writes the stored
+// value back into `expected`.
+TEST_F(ThreadLocalTest, CompareAndSwap) {
+  ThreadLocalPtr tls;
+  ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(1)) == nullptr);
+
+  // Stored value is 1 and we expect 1: swap in 2.
+  void* expected = reinterpret_cast<void*>(1);
+  bool swapped = tls.CompareAndSwap(reinterpret_cast<void*>(2), expected);
+  ASSERT_TRUE(swapped);
+
+  // Stored value is 2 but we expect 100: the call fails and reports 2.
+  expected = reinterpret_cast<void*>(100);
+  swapped = tls.CompareAndSwap(reinterpret_cast<void*>(2), expected);
+  ASSERT_TRUE(!swapped);
+  ASSERT_EQ(expected, reinterpret_cast<void*>(2));
+
+  // Stored value is 2 and we expect 2: swap in 3.
+  expected = reinterpret_cast<void*>(2);
+  swapped = tls.CompareAndSwap(reinterpret_cast<void*>(3), expected);
+  ASSERT_TRUE(swapped);
+  ASSERT_EQ(tls.Get(), reinterpret_cast<void*>(3));
+}
+
+namespace {
+
+// Thread body for DISABLED_MainThreadDiesFirst: creates a ThreadLocalPtr and
+// stores a value in it between the two sync points.
+// NOTE(review): the std::string is stored without an unref handler, so
+// nothing in this function frees it - presumably acceptable for a manual
+// test; confirm.
+void* AccessThreadLocal(void* /*arg*/) {
+ TEST_SYNC_POINT("AccessThreadLocal:Start");
+ ThreadLocalPtr tlp;
+ tlp.Reset(new std::string("hello RocksDB"));
+ TEST_SYNC_POINT("AccessThreadLocal:End");
+ return nullptr;
+}
+
+} // namespace
+
+// The following test is disabled as it requires manual steps to run it
+// correctly.
+//
+// Currently we have no way to access SyncPoint w/o ASAN error when the
+// child thread dies after the main thread dies. So if you manually enable
+// this test and only see an ASAN error on SyncPoint, it means you pass the
+// test.
+// Starts a detached thread running AccessThreadLocal and uses sync-point
+// dependencies to order it against the end of this test and against
+// "PosixEnv::~PosixEnv():End".
+TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) {
+ // Each pair is {predecessor, successor}.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"AccessThreadLocal:Start", "MainThreadDiesFirst:End"},
+ {"PosixEnv::~PosixEnv():End", "AccessThreadLocal:End"}});
+
+ // Triggers the initialization of singletons.
+ Env::Default();
+
+#ifndef ROCKSDB_LITE
+ try {
+#endif // ROCKSDB_LITE
+ ROCKSDB_NAMESPACE::port::Thread th(&AccessThreadLocal, nullptr);
+ // Detached so the thread can outlive this test body.
+ th.detach();
+ TEST_SYNC_POINT("MainThreadDiesFirst:End");
+#ifndef ROCKSDB_LITE
+ } catch (const std::system_error& ex) {
+ std::cerr << "Start thread: " << ex.code() << std::endl;
+ FAIL();
+ }
+#endif // ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: initializes GoogleTest from the command line and
+// runs every registered test, returning the result of RUN_ALL_TESTS().
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/thread_operation.h b/src/rocksdb/util/thread_operation.h
new file mode 100644
index 000000000..b5d4b6906
--- /dev/null
+++ b/src/rocksdb/util/thread_operation.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for thread operation and state.
+// Thread operations are used to describe high level action of a
+// thread such as doing compaction or flush, while thread state
+// are used to describe lower-level action such as reading /
+// writing a file or waiting for a mutex. Operations and states
+// are designed to be independent. Typically, a thread is involved
+// in one operation and one state at any specific point in time.
+
+#pragma once
+
+#include "rocksdb/thread_status.h"
+
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+// The structure that describes a major thread operation.
+struct OperationInfo {
+ const ThreadStatus::OperationType type; // operation identifier
+ const std::string name; // display name; empty for OP_UNKNOWN
+};
+
+// The global operation table.
+//
+// When updating a status of a thread, the pointer of the OperationInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+//
+// Note that it's not designed to be constant as in the future we
+// might consider adding global count to the OperationInfo.
+//
+// The table is declared `static` in this header, so every translation unit
+// that includes the header gets its own copy.
+static OperationInfo global_operation_table[] = {
+ {ThreadStatus::OP_UNKNOWN, ""},
+ {ThreadStatus::OP_COMPACTION, "Compaction"},
+ {ThreadStatus::OP_FLUSH, "Flush"}
+};
+
+// Pairs an operation stage with the string used to describe it.
+struct OperationStageInfo {
+ const ThreadStatus::OperationStage stage; // stage identifier
+ const std::string name; // stage description string
+};
+
+// A table that maintains the mapping from stage type to stage string.
+// Each string is the name of the function that the stage corresponds to,
+// so it must be updated whenever that function is renamed.
+static OperationStageInfo global_op_stage_table[] = {
+ {ThreadStatus::STAGE_UNKNOWN, ""},
+ {ThreadStatus::STAGE_FLUSH_RUN,
+ "FlushJob::Run"},
+ {ThreadStatus::STAGE_FLUSH_WRITE_L0,
+ "FlushJob::WriteLevel0Table"},
+ {ThreadStatus::STAGE_COMPACTION_PREPARE,
+ "CompactionJob::Prepare"},
+ {ThreadStatus::STAGE_COMPACTION_RUN,
+ "CompactionJob::Run"},
+ {ThreadStatus::STAGE_COMPACTION_PROCESS_KV,
+ "CompactionJob::ProcessKeyValueCompaction"},
+ {ThreadStatus::STAGE_COMPACTION_INSTALL,
+ "CompactionJob::Install"},
+ {ThreadStatus::STAGE_COMPACTION_SYNC_FILE,
+ "CompactionJob::FinishCompactionOutputFile"},
+ {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH,
+ "MemTableList::PickMemtablesToFlush"},
+ {ThreadStatus::STAGE_MEMTABLE_ROLLBACK,
+ "MemTableList::RollbackMemtableFlush"},
+ {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+ "MemTableList::TryInstallMemtableFlushResults"},
+};
+
+// The structure that describes a state.
+struct StateInfo {
+ const ThreadStatus::StateType type; // state identifier
+ const std::string name; // display name; empty for STATE_UNKNOWN
+};
+
+// The global state table.
+//
+// When updating a status of a thread, the pointer of the StateInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+//
+// Only two states are listed: unknown and waiting on a mutex.
+static StateInfo global_state_table[] = {
+ {ThreadStatus::STATE_UNKNOWN, ""},
+ {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"},
+};
+
+// Pairs an operation property code with its display name. The tables below
+// fill `code` with ThreadStatus property enumerators.
+struct OperationProperty {
+ int code;
+ std::string name;
+};
+
+// Display names for the properties reported for a compaction operation.
+static OperationProperty compaction_operation_properties[] = {
+ {ThreadStatus::COMPACTION_JOB_ID, "JobID"},
+ {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"},
+ {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"},
+ {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"},
+ {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"},
+ {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"},
+};
+
+// Display names for the properties reported for a flush operation.
+static OperationProperty flush_operation_properties[] = {
+ {ThreadStatus::FLUSH_JOB_ID, "JobID"},
+ {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"},
+ {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"}
+};
+
+#else
+
+// Empty placeholder used when ROCKSDB_USING_THREAD_STATUS is not defined.
+struct OperationInfo {
+};
+
+// Empty placeholder used when ROCKSDB_USING_THREAD_STATUS is not defined.
+struct StateInfo {
+};
+
+#endif // ROCKSDB_USING_THREAD_STATUS
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/threadpool_imp.cc b/src/rocksdb/util/threadpool_imp.cc
new file mode 100644
index 000000000..bf216c956
--- /dev/null
+++ b/src/rocksdb/util/threadpool_imp.cc
@@ -0,0 +1,507 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/threadpool_imp.h"
+
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+
+#ifndef OS_WIN
+# include <unistd.h>
+#endif
+
+#ifdef OS_LINUX
+# include <sys/syscall.h>
+# include <sys/resource.h>
+#endif
+
+#include <stdlib.h>
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Checks the return code of a pthread call. A zero `result` is success and
+// does nothing; any other value is printed to stderr together with `label`
+// and the process is aborted.
+void ThreadPoolImpl::PthreadCall(const char* label, int result) {
+  if (result == 0) {
+    return;
+  }
+  fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+  abort();
+}
+
+// Private implementation of ThreadPoolImpl: a FIFO job queue served by a
+// resizable set of background threads, synchronized with mu_ and bgsignal_.
+struct ThreadPoolImpl::Impl {
+
+ Impl();
+ ~Impl();
+
+ // Signals all threads to exit and joins them; see the definition for how
+ // `wait_for_jobs_to_complete` affects queued jobs.
+ void JoinThreads(bool wait_for_jobs_to_complete);
+
+ void SetBackgroundThreadsInternal(int num, bool allow_reduce);
+ int GetBackgroundThreads();
+
+ // Last published queue length; read without taking mu_.
+ unsigned int GetQueueLen() const {
+ return queue_len_.load(std::memory_order_relaxed);
+ }
+
+ void LowerIOPriority();
+
+ void LowerCPUPriority();
+
+ void WakeUpAllThreads() {
+ bgsignal_.notify_all();
+ }
+
+ // Main loop executed by each background thread.
+ void BGThread(size_t thread_id);
+
+ // Creates threads until bgthreads_ reaches total_threads_limit_.
+ void StartBGThreads();
+
+ // Enqueues a job. `unschedule` is run instead of `schedule` if the job is
+ // removed through UnSchedule() with a matching `tag`.
+ void Submit(std::function<void()>&& schedule,
+ std::function<void()>&& unschedule, void* tag);
+
+ // Removes all queued jobs whose tag equals `arg`; returns how many.
+ int UnSchedule(void* arg);
+
+ void SetHostEnv(Env* env) { env_ = env; }
+
+ Env* GetHostEnv() const { return env_; }
+
+ // True when more threads exist than the current limit allows.
+ bool HasExcessiveThread() const {
+ return static_cast<int>(bgthreads_.size()) > total_threads_limit_;
+ }
+
+ // Return true iff the current thread is the excessive thread to terminate.
+ // Always terminate the running thread that is added last, even if there are
+ // more than one thread to terminate.
+ bool IsLastExcessiveThread(size_t thread_id) const {
+ return HasExcessiveThread() && thread_id == bgthreads_.size() - 1;
+ }
+
+ // True when `thread_id` is at or beyond the current thread limit.
+ bool IsExcessiveThread(size_t thread_id) const {
+ return static_cast<int>(thread_id) >= total_threads_limit_;
+ }
+
+ // Return the thread priority.
+ // This would allow its member-thread to know its priority.
+ Env::Priority GetThreadPriority() const { return priority_; }
+
+ // Set the thread priority.
+ void SetThreadPriority(Env::Priority priority) { priority_ = priority; }
+
+private:
+ // Thread entry point; `arg` is a BGThreadMetadata*.
+ static void BGThreadWrapper(void* arg);
+
+ bool low_io_priority_; // set by LowerIOPriority()
+ bool low_cpu_priority_; // set by LowerCPUPriority()
+ Env::Priority priority_;
+ Env* env_;
+
+ int total_threads_limit_; // target number of background threads
+ std::atomic_uint queue_len_; // Queue length. Used for stats reporting
+ bool exit_all_threads_; // set while JoinThreads() is shutting threads down
+ bool wait_for_jobs_to_complete_; // drain the queue before threads exit
+
+ // Entry per Schedule()/Submit() call
+ struct BGItem {
+ void* tag = nullptr;
+ std::function<void()> function;
+ std::function<void()> unschedFunction;
+ };
+
+ using BGQueue = std::deque<BGItem>;
+ BGQueue queue_;
+
+ std::mutex mu_;
+ std::condition_variable bgsignal_;
+ std::vector<port::Thread> bgthreads_;
+};
+
+
+// Builds an idle pool: no priority reductions requested, LOW priority, no
+// host Env, a thread limit of 0, and an empty queue and thread list.
+inline
+ThreadPoolImpl::Impl::Impl()
+ :
+ low_io_priority_(false),
+ low_cpu_priority_(false),
+ priority_(Env::LOW),
+ env_(nullptr),
+ total_threads_limit_(0),
+ queue_len_(),
+ exit_all_threads_(false),
+ wait_for_jobs_to_complete_(false),
+ queue_(),
+ mu_(),
+ bgsignal_(),
+ bgthreads_() {
+}
+
+// Asserts that no background threads remain when the pool is destroyed
+// (JoinThreads() clears bgthreads_).
+inline
+ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); }
+
+// Shuts down all background threads and waits for them to finish.
+// If `wait_for_jobs_to_complete` is true the threads keep taking jobs until
+// the queue is empty before exiting; otherwise they exit as soon as they
+// observe the shutdown flag.
+void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) {
+
+ std::unique_lock<std::mutex> lock(mu_);
+ assert(!exit_all_threads_);
+
+ wait_for_jobs_to_complete_ = wait_for_jobs_to_complete;
+ exit_all_threads_ = true;
+ // prevent threads from being recreated right after they're joined, in case
+ // the user is concurrently submitting jobs.
+ total_threads_limit_ = 0;
+
+ // Release mu_ before joining so the background threads can acquire it.
+ lock.unlock();
+
+ bgsignal_.notify_all();
+
+ for (auto& th : bgthreads_) {
+ th.join();
+ }
+
+ bgthreads_.clear();
+
+ // NOTE(review): these flags are reset without holding mu_, while Submit()
+ // and SetBackgroundThreadsInternal() read exit_all_threads_ under mu_ -
+ // confirm callers never run those concurrently with JoinThreads().
+ exit_all_threads_ = false;
+ wait_for_jobs_to_complete_ = false;
+}
+
+// Records, under mu_, that IO priority should be lowered. BGThread() compares
+// this flag with its own state after dequeuing a job.
+inline
+void ThreadPoolImpl::Impl::LowerIOPriority() {
+ std::lock_guard<std::mutex> lock(mu_);
+ low_io_priority_ = true;
+}
+
+// Records, under mu_, that CPU priority should be lowered. BGThread()
+// compares this flag with its own state after dequeuing a job.
+inline
+void ThreadPoolImpl::Impl::LowerCPUPriority() {
+ std::lock_guard<std::mutex> lock(mu_);
+ low_cpu_priority_ = true;
+}
+
+// Main loop of one pool thread. `thread_id` is the id this thread was given
+// at creation. Each iteration waits under mu_ until there is a job to run,
+// the pool is shutting down, or this thread is the last excessive one; it
+// then either leaves the loop or runs the job at the front of the queue with
+// mu_ released.
+void ThreadPoolImpl::Impl::BGThread(size_t thread_id) {
+ // This thread's own record of which priority reductions it already applied.
+ bool low_io_priority = false;
+ bool low_cpu_priority = false;
+
+ while (true) {
+ // Wait until there is an item that is ready to run
+ std::unique_lock<std::mutex> lock(mu_);
+ // Stop waiting if the thread needs to do work or needs to terminate.
+ while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+ (queue_.empty() || IsExcessiveThread(thread_id))) {
+ bgsignal_.wait(lock);
+ }
+
+ if (exit_all_threads_) { // mechanism to let BG threads exit safely
+
+ // Keep running only if asked to drain the queue and jobs remain.
+ if (!wait_for_jobs_to_complete_ ||
+ queue_.empty()) {
+ break;
+ }
+ }
+
+ if (IsLastExcessiveThread(thread_id)) {
+ // Current thread is the last generated one and is excessive.
+ // We always terminate excessive thread in the reverse order of
+ // generation time.
+ // Detach first: the handle is about to be removed from bgthreads_, so
+ // nothing will join this thread.
+ auto& terminating_thread = bgthreads_.back();
+ terminating_thread.detach();
+ bgthreads_.pop_back();
+
+ if (HasExcessiveThread()) {
+ // There is still at least one more excessive thread to terminate.
+ WakeUpAllThreads();
+ }
+ break;
+ }
+
+ // Take the oldest job and publish the new queue length.
+ auto func = std::move(queue_.front().function);
+ queue_.pop_front();
+
+ queue_len_.store(static_cast<unsigned int>(queue_.size()),
+ std::memory_order_relaxed);
+
+ // Sample the requested priorities while still holding mu_.
+ bool decrease_io_priority = (low_io_priority != low_io_priority_);
+ bool decrease_cpu_priority = (low_cpu_priority != low_cpu_priority_);
+ lock.unlock();
+
+#ifdef OS_LINUX
+ if (decrease_cpu_priority) {
+ setpriority(
+ PRIO_PROCESS,
+ // Current thread.
+ 0,
+ // Lowest priority possible.
+ 19);
+ low_cpu_priority = true;
+ }
+
+ if (decrease_io_priority) {
+#define IOPRIO_CLASS_SHIFT (13)
+#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data)
+ // Put schedule into IOPRIO_CLASS_IDLE class (lowest)
+ // These system calls only have an effect when used in conjunction
+ // with an I/O scheduler that supports I/O priorities. As at
+ // kernel 2.6.17 the only such scheduler is the Completely
+ // Fair Queuing (CFQ) I/O scheduler.
+ // To change scheduler:
+ // echo cfq > /sys/block/<device_name>/queue/scheduler
+ // Tunables to consider:
+ // /sys/block/<device_name>/queue/slice_idle
+ // /sys/block/<device_name>/queue/slice_sync
+ syscall(SYS_ioprio_set, 1, // IOPRIO_WHO_PROCESS
+ 0, // current thread
+ IOPRIO_PRIO_VALUE(3, 0));
+ low_io_priority = true;
+ }
+#else
+ (void)decrease_io_priority; // avoid 'unused variable' error
+ (void)decrease_cpu_priority;
+#endif
+ // Run the job outside the lock.
+ func();
+ }
+}
+
+// Helper struct for passing arguments when creating threads.
+struct BGThreadMetadata {
+ ThreadPoolImpl::Impl* thread_pool_; // pool the new thread belongs to
+ size_t thread_id_; // Id of the new thread within its pool.
+ BGThreadMetadata(ThreadPoolImpl::Impl* thread_pool, size_t thread_id)
+ : thread_pool_(thread_pool), thread_id_(thread_id) {}
+};
+
+// Entry point for every pool thread.
+//
+// `arg` is a heap-allocated BGThreadMetadata whose ownership passes to this
+// function. It is freed as soon as its fields have been copied out, so no
+// return path can leak it. The function then (when thread status tracking is
+// compiled in) registers the thread under the type matching the pool's
+// priority, runs the pool's BGThread() loop, and unregisters the thread.
+void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) {
+  BGThreadMetadata* meta = static_cast<BGThreadMetadata*>(arg);
+  const size_t thread_id = meta->thread_id_;
+  ThreadPoolImpl::Impl* tp = meta->thread_pool_;
+  // Free the metadata before anything below can return early: the
+  // Env::Priority::TOTAL case returns when assertions are compiled out.
+  delete meta;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  // initialize it because compiler isn't good enough to see we don't use it
+  // uninitialized
+  ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES;
+  switch (tp->GetThreadPriority()) {
+    case Env::Priority::HIGH:
+      thread_type = ThreadStatus::HIGH_PRIORITY;
+      break;
+    case Env::Priority::LOW:
+      thread_type = ThreadStatus::LOW_PRIORITY;
+      break;
+    case Env::Priority::BOTTOM:
+      thread_type = ThreadStatus::BOTTOM_PRIORITY;
+      break;
+    case Env::Priority::USER:
+      thread_type = ThreadStatus::USER;
+      break;
+    case Env::Priority::TOTAL:
+      // TOTAL is not a valid pool priority.
+      assert(false);
+      return;
+  }
+  assert(thread_type != ThreadStatus::NUM_THREAD_TYPES);
+  ThreadStatusUtil::RegisterThread(tp->GetHostEnv(), thread_type);
+#endif
+  tp->BGThread(thread_id);
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  ThreadStatusUtil::UnregisterThread();
+#endif
+}
+
+// Changes the thread limit to `num` (clamped to at least 0). The limit is
+// always allowed to grow; it shrinks only when `allow_reduce` is true. A
+// request made while the pool is shutting down, or one that would not change
+// the limit, is ignored. After a change all threads are woken and any missing
+// threads are started.
+void ThreadPoolImpl::Impl::SetBackgroundThreadsInternal(int num,
+                                                        bool allow_reduce) {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (exit_all_threads_) {
+    return;
+  }
+
+  const bool grows = num > total_threads_limit_;
+  const bool shrinks = num < total_threads_limit_ && allow_reduce;
+  if (!grows && !shrinks) {
+    return;
+  }
+
+  total_threads_limit_ = std::max(0, num);
+  WakeUpAllThreads();
+  StartBGThreads();
+}
+
+// Returns the current thread limit, read under mu_. This is the configured
+// limit, not the number of threads in bgthreads_.
+int ThreadPoolImpl::Impl::GetBackgroundThreads() {
+ std::unique_lock<std::mutex> lock(mu_);
+ return total_threads_limit_;
+}
+
+// Starts background threads until bgthreads_ holds total_threads_limit_
+// entries. Both callers visible in this file (SetBackgroundThreadsInternal
+// and Submit) invoke it while holding mu_.
+void ThreadPoolImpl::Impl::StartBGThreads() {
+  // Start background thread if necessary
+  while (static_cast<int>(bgthreads_.size()) < total_threads_limit_) {
+    // The new thread's id is its index in bgthreads_. BGThreadWrapper takes
+    // ownership of the metadata object.
+    port::Thread p_t(&BGThreadWrapper,
+                     new BGThreadMetadata(this, bgthreads_.size()));
+
+// Set the thread name to aid debugging
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+    auto th_handle = p_t.native_handle();
+    std::string thread_priority = Env::PriorityToString(GetThreadPriority());
+    std::ostringstream thread_name_stream;
+    thread_name_stream << "rocksdb:";
+    for (char c : thread_priority) {
+      // tolower() requires a value representable as unsigned char (or EOF);
+      // convert first so a negative char can never be passed.
+      thread_name_stream << static_cast<char>(
+          tolower(static_cast<unsigned char>(c)));
+    }
+    thread_name_stream << bgthreads_.size();
+    pthread_setname_np(th_handle, thread_name_stream.str().c_str());
+#endif
+#endif
+    bgthreads_.push_back(std::move(p_t));
+  }
+}
+
+// Appends a job to the pool's queue.
+//   schedule   - stored as the job's function
+//   unschedule - stored as the job's unschedule function (may be empty)
+//   tag        - stored with the job; UnSchedule() matches jobs by this value
+// The call is ignored when exit_all_threads_ is set. Missing background
+// threads are started before the job is queued.
+void ThreadPoolImpl::Impl::Submit(std::function<void()>&& schedule,
+                                  std::function<void()>&& unschedule,
+                                  void* tag) {
+  std::lock_guard<std::mutex> guard(mu_);
+  if (exit_all_threads_) {
+    return;
+  }
+
+  StartBGThreads();
+
+  // Append an empty entry to the queue, then fill it in place.
+  queue_.push_back(BGItem());
+  auto& entry = queue_.back();
+  entry.tag = tag;
+  entry.function = std::move(schedule);
+  entry.unschedFunction = std::move(unschedule);
+
+  const unsigned int pending = static_cast<unsigned int>(queue_.size());
+  queue_len_.store(pending, std::memory_order_relaxed);
+
+  if (HasExcessiveThread()) {
+    // Need to wake up all threads to make sure the one woken
+    // up is not the one to terminate.
+    WakeUpAllThreads();
+  } else {
+    // Wake up at least one waiting thread.
+    bgsignal_.notify_one();
+  }
+}
+
+// Removes every queued job whose tag equals `arg` and returns how many jobs
+// were removed. The unschedule functions of the removed jobs are collected
+// while the mutex is held and invoked only after it has been released.
+int ThreadPoolImpl::Impl::UnSchedule(void* arg) {
+  int removed = 0;
+  std::vector<std::function<void()>> callbacks;
+
+  {
+    std::lock_guard<std::mutex> guard(mu_);
+
+    for (BGQueue::iterator pos = queue_.begin(); pos != queue_.end();) {
+      if (pos->tag != arg) {
+        ++pos;
+        continue;
+      }
+      if (pos->unschedFunction) {
+        callbacks.push_back(std::move(pos->unschedFunction));
+      }
+      pos = queue_.erase(pos);
+      ++removed;
+    }
+
+    const unsigned int pending = static_cast<unsigned int>(queue_.size());
+    queue_len_.store(pending, std::memory_order_relaxed);
+  }
+
+  // Run unschedule functions outside the mutex
+  for (auto& callback : callbacks) {
+    callback();
+  }
+
+  return removed;
+}
+
+// Creates the pool together with its private implementation object.
+ThreadPoolImpl::ThreadPoolImpl() : impl_(new Impl()) {}
+
+// Defined out of line, where Impl is a complete type; impl_ is released by
+// its std::unique_ptr.
+ThreadPoolImpl::~ThreadPoolImpl() = default;
+
+void ThreadPoolImpl::JoinAllThreads() {
+ impl_->JoinThreads(false);  // false here, true in WaitForJobsAndJoinAllThreads(); presumably "wait for queued jobs" -- confirm in Impl::JoinThreads
+}
+
+void ThreadPoolImpl::SetBackgroundThreads(int num) {
+ impl_->SetBackgroundThreadsInternal(num, true);  // allow_reduce = true: the limit may be lowered as well as raised
+}
+
+int ThreadPoolImpl::GetBackgroundThreads() {
+ return impl_->GetBackgroundThreads();  // current thread limit, read under the pool mutex
+}
+
+unsigned int ThreadPoolImpl::GetQueueLen() const {
+ return impl_->GetQueueLen();  // forwarded unchanged to the implementation object
+}
+
+void ThreadPoolImpl::WaitForJobsAndJoinAllThreads() {
+ impl_->JoinThreads(true);  // same call as JoinAllThreads() but with the flag set to true
+}
+
+void ThreadPoolImpl::LowerIOPriority() {
+ impl_->LowerIOPriority();  // forwarded unchanged to the implementation object
+}
+
+void ThreadPoolImpl::LowerCPUPriority() {
+ impl_->LowerCPUPriority();  // forwarded unchanged to the implementation object
+}
+
+void ThreadPoolImpl::IncBackgroundThreadsIfNeeded(int num) {
+ impl_->SetBackgroundThreadsInternal(num, false);  // allow_reduce = false: the limit can only grow
+}
+
+// Copying overload: the caller keeps `job`; the pool is handed its own copy.
+// The job is submitted without an unschedule function and with a null tag.
+void ThreadPoolImpl::SubmitJob(const std::function<void()>& job) {
+  std::function<void()> owned(job);
+  std::function<void()> no_unschedule;
+  impl_->Submit(std::move(owned), std::move(no_unschedule), nullptr);
+}
+
+
+// Moving overload: `job` is handed to the pool without being copied.
+// The job is submitted without an unschedule function and with a null tag.
+void ThreadPoolImpl::SubmitJob(std::function<void()>&& job) {
+  std::function<void()> no_unschedule;
+  impl_->Submit(std::move(job), std::move(no_unschedule), nullptr);
+}
+
+// Queues `function(arg)` under `tag`. When `unschedFunction` is non-null,
+// `unschedFunction(arg)` is stored with the job as its unschedule function;
+// otherwise the job carries an empty one.
+void ThreadPoolImpl::Schedule(void(*function)(void* arg1), void* arg,
+                              void* tag, void(*unschedFunction)(void* arg)) {
+  std::function<void()> on_unschedule;
+  if (unschedFunction != nullptr) {
+    on_unschedule = std::bind(unschedFunction, arg);
+  }
+  impl_->Submit(std::bind(function, arg), std::move(on_unschedule), tag);
+}
+
+int ThreadPoolImpl::UnSchedule(void* arg) {
+ return impl_->UnSchedule(arg);  // number of queued jobs removed for this tag
+}
+
+void ThreadPoolImpl::SetHostEnv(Env* env) { impl_->SetHostEnv(env); }  // forwarded to the implementation object
+
+Env* ThreadPoolImpl::GetHostEnv() const { return impl_->GetHostEnv(); }  // forwarded to the implementation object
+
+// Returns the priority assigned to this pool.
+// This lets a member thread find out the priority it runs at.
+Env::Priority ThreadPoolImpl::GetThreadPriority() const {
+ return impl_->GetThreadPriority();
+}
+
+// Sets the priority assigned to this pool.
+void ThreadPoolImpl::SetThreadPriority(Env::Priority priority) {
+ impl_->SetThreadPriority(priority);
+}
+
+// Allocates a ThreadPoolImpl with `new`, sets its background thread limit to
+// `num_threads`, and returns it as a ThreadPool*.
+ThreadPool* NewThreadPool(int num_threads) {
+  ThreadPoolImpl* pool = new ThreadPoolImpl();
+  pool->SetBackgroundThreads(num_threads);
+  return pool;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/threadpool_imp.h b/src/rocksdb/util/threadpool_imp.h
new file mode 100644
index 000000000..e3c12577b
--- /dev/null
+++ b/src/rocksdb/util/threadpool_imp.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "rocksdb/threadpool.h"
+#include "rocksdb/env.h"
+
+#include <memory>
+#include <functional>
+
+namespace ROCKSDB_NAMESPACE {
+
+class ThreadPoolImpl : public ThreadPool {
+ public:
+ ThreadPoolImpl();
+ ~ThreadPoolImpl();
+
+ ThreadPoolImpl(ThreadPoolImpl&&) = delete;
+ ThreadPoolImpl& operator=(ThreadPoolImpl&&) = delete;
+
+ // Implementation of the ThreadPool interface
+
+ // Waits for all threads to finish.
+ // Jobs that have not started executing
+ // are discarded; jobs that are already running
+ // are allowed to complete.
+ void JoinAllThreads() override;
+
+ // Sets the number of background threads that will be executing the
+ // scheduled jobs.
+ void SetBackgroundThreads(int num) override;
+ int GetBackgroundThreads() override;
+
+ // Returns the number of jobs scheduled in the ThreadPool queue.
+ unsigned int GetQueueLen() const override;
+
+ // Waits for all jobs to complete, both those
+ // that have already started running and those that have not
+ // started yet, then joins all threads.
+ void WaitForJobsAndJoinAllThreads() override;
+
+ // Makes the threads run at a lower kernel IO priority.
+ // Currently only has an effect on Linux.
+ void LowerIOPriority();
+
+ // Makes the threads run at a lower kernel CPU priority.
+ // Currently only has an effect on Linux.
+ void LowerCPUPriority();
+
+ // Ensures there are at least num threads in the pool,
+ // but does not kill threads if there are more.
+ void IncBackgroundThreadsIfNeeded(int num);
+
+ // Submit a fire-and-forget job.
+ // These jobs cannot be unscheduled.
+
+ // Copying overload: allows the same job to be submitted multiple times.
+ void SubmitJob(const std::function<void()>&) override;
+ // Moving overload: moves the function in for efficiency.
+ void SubmitJob(std::function<void()>&&) override;
+
+ // Schedules a job with an unschedule tag and an unschedule function.
+ // The tag can be used to find and unschedule jobs
+ // that are still in the queue and have not started running.
+ void Schedule(void (*function)(void* arg1), void* arg, void* tag,
+ void (*unschedFunction)(void* arg));
+
+ // Finds the jobs that are still in the queue and match
+ // the given tag, removes them from the queue,
+ // and for each such job executes the unschedule function
+ // if one was given at scheduling time. Returns the number of jobs removed.
+ int UnSchedule(void* tag);
+
+ void SetHostEnv(Env* env);
+
+ Env* GetHostEnv() const;
+
+ // Returns the thread priority.
+ // This lets a member thread find out its priority.
+ Env::Priority GetThreadPriority() const;
+
+ // Sets the thread priority.
+ void SetThreadPriority(Env::Priority priority);
+
+ static void PthreadCall(const char* label, int result);
+
+ struct Impl;
+
+ private:
+
+ // The current public virtual interface does not provide usable
+ // functionality and thus cannot be used internally as a
+ // facade over different implementations.
+ //
+ // We use the pimpl idiom so that the thread pool implementation can be
+ // replaced without touching this header, by providing a different .cc file,
+ // potentially selected by a CMake option.
+ //
+ // Another option is to introduce an Env::MakeThreadPool() virtual interface
+ // and override the environment. This would require refactoring ThreadPool usage.
+ //
+ // The two approaches can also be combined.
+ std::unique_ptr<Impl> impl_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/timer_queue.h b/src/rocksdb/util/timer_queue.h
new file mode 100644
index 000000000..3bd517531
--- /dev/null
+++ b/src/rocksdb/util/timer_queue.h
@@ -0,0 +1,230 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free to copy, modify, share, do whatever you want with it.
+// No attribution is required, but I'll be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to the
+// public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law. You can copy, modify, distribute and perform the
+// work, even for commercial purposes, all without asking permission.
+
+#pragma once
+
+#include <assert.h>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <queue>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "port/port.h"
+#include "test_util/sync_point.h"
+
+// Allows execution of handlers at a specified time in the future
+// Guarantees:
+// - All handlers are executed ONCE, even if cancelled (aborted parameter will
+// be set to true)
+// - If TimerQueue is destroyed, it will cancel all handlers.
+// - Handlers are ALWAYS executed in the Timer Queue worker thread.
+// - Handlers execution order is NOT guaranteed
+//
+////////////////////////////////////////////////////////////////////////////////
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+class TimerQueue {
+ public:
+  // Starts the worker thread, which immediately enters run().
+  TimerQueue() : m_th(&TimerQueue::run, this) {}
+
+  ~TimerQueue() { shutdown(); }
+
+  // Cancels every timer, stops the worker thread and joins it. A second call
+  // returns immediately.
+  // This function is not thread-safe.
+  void shutdown() {
+    if (closed_) {
+      return;
+    }
+    cancelAll();
+    // Abusing the timer queue to trigger the shutdown.
+    add(0, [this](bool) {
+      m_finish = true;
+      return std::make_pair(false, 0);
+    });
+    m_th.join();
+    closed_ = true;
+  }
+
+  // Adds a new timer that fires `milliseconds` from now.
+  //
+  // `handler` is called on the worker thread with `aborted == true` when the
+  // timer was cancelled and `false` when it expired. It returns a pair
+  // (reschedule, period_ms): when `reschedule` is true and cancelAll() has
+  // not been called, the timer is queued again `period_ms` milliseconds
+  // after the handler returns; a `period_ms` of -1 reuses the timer's
+  // current period.
+  // \return
+  //  Returns the ID of the new timer. You can use this ID to cancel the
+  // timer
+  uint64_t add(int64_t milliseconds,
+               std::function<std::pair<bool, int64_t>(bool)> handler) {
+    WorkItem item;
+    Clock::time_point tp = Clock::now();
+    item.end = tp + std::chrono::milliseconds(milliseconds);
+    TEST_SYNC_POINT_CALLBACK("TimeQueue::Add:item.end", &item.end);
+    item.period = milliseconds;
+    item.handler = std::move(handler);
+
+    std::unique_lock<std::mutex> lk(m_mtx);
+    uint64_t id = ++m_idcounter;
+    item.id = id;
+    m_items.push(std::move(item));
+
+    // Something changed, so wake up timer thread
+    m_checkWork.notify_one();
+    return id;
+  }
+
+  // Cancels the specified timer
+  // \return
+  //  1 if the timer was cancelled.
+  //  0 if you were too late to cancel (or the timer ID was never valid to
+  // start with)
+  size_t cancel(uint64_t id) {
+    // Instead of removing the item from the container (thus breaking the
+    // heap integrity), we set the item as having no handler, and put
+    // that handler on a new item at the top for immediate execution
+    // The timer thread will then ignore the original item, since it has no
+    // handler.
+    std::unique_lock<std::mutex> lk(m_mtx);
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id == id && item.handler) {
+        WorkItem newItem;
+        // Zero time, so it stays at the top for immediate execution
+        newItem.end = Clock::time_point();
+        newItem.id = 0;  // Means it is a canceled item
+        // Carry the period over so that a handler answering (true, -1)
+        // reschedules with the timer's period instead of an indeterminate
+        // value.
+        newItem.period = item.period;
+        // Move the handler from item to newitem (thus clearing item)
+        newItem.handler = std::move(item.handler);
+        // push() may reallocate the container being iterated; that is safe
+        // only because we return right after it.
+        m_items.push(std::move(newItem));
+
+        // Something changed, so wake up timer thread
+        m_checkWork.notify_one();
+        return 1;
+      }
+    }
+    return 0;
+  }
+
+  // Cancels all timers and disables rescheduling from then on (m_cancel is
+  // never reset).
+  // \return
+  //  The number of timers cancelled by this call. Items that were already
+  //  cancelled, or whose handler was moved away by cancel(), are not counted.
+  size_t cancelAll() {
+    // Setting all "end" to 0 (for immediate execution) is ok,
+    // since it maintains the heap integrity
+    std::unique_lock<std::mutex> lk(m_mtx);
+    m_cancel = true;
+    size_t cancelled = 0;
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id && item.handler) {
+        item.end = Clock::time_point();
+        item.id = 0;
+        ++cancelled;
+      }
+    }
+
+    m_checkWork.notify_one();
+    return cancelled;
+  }
+
+ private:
+  using Clock = std::chrono::steady_clock;
+  TimerQueue(const TimerQueue&) = delete;
+  TimerQueue& operator=(const TimerQueue&) = delete;
+
+  // Worker thread body: sleeps until the earliest timer is due (or until it
+  // is notified), then executes everything that has expired. Leaves the loop
+  // once m_finish has been set by the handler that shutdown() queues.
+  void run() {
+    std::unique_lock<std::mutex> lk(m_mtx);
+    while (!m_finish) {
+      auto end = calcWaitTime_lock();
+      if (end.first) {
+        // Timers found, so wait until it expires (or something else
+        // changes)
+        m_checkWork.wait_until(lk, end.second);
+      } else {
+        // No timers exist, so wait forever until something changes
+        m_checkWork.wait(lk);
+      }
+
+      // Check and execute as much work as possible, such as, all expired
+      // timers
+      checkWork(&lk);
+    }
+
+    // If we are shutting down, we should not have any items left,
+    // since the shutdown cancels all items
+    assert(m_items.size() == 0);
+  }
+
+  // Must be called with m_mtx held. Drops handler-less (cancelled) items from
+  // the top of the heap and returns (true, deadline of the earliest item), or
+  // (false, default time point) when the queue is empty.
+  std::pair<bool, Clock::time_point> calcWaitTime_lock() {
+    while (m_items.size()) {
+      if (m_items.top().handler) {
+        // Item present, so return the new wait time
+        return std::make_pair(true, m_items.top().end);
+      } else {
+        // Discard empty handlers (they were cancelled)
+        m_items.pop();
+      }
+    }
+
+    // No items found, so return no wait time (causes the thread to wait
+    // indefinitely)
+    return std::make_pair(false, Clock::time_point());
+  }
+
+  // Pops and runs every item whose deadline has passed. `lk` must own m_mtx
+  // on entry; it is released around each handler call and re-acquired
+  // afterwards, so handlers may call add()/cancel() themselves.
+  void checkWork(std::unique_lock<std::mutex>* lk) {
+    while (m_items.size() && m_items.top().end <= Clock::now()) {
+      WorkItem item(m_items.top());
+      m_items.pop();
+
+      if (item.handler) {
+        (*lk).unlock();
+        auto reschedule_pair = item.handler(item.id == 0);
+        (*lk).lock();
+        if (!m_cancel && reschedule_pair.first) {
+          // -1 means "keep the current period".
+          int64_t new_period = (reschedule_pair.second == -1)
+                                   ? item.period
+                                   : reschedule_pair.second;
+
+          item.period = new_period;
+          item.end = Clock::now() + std::chrono::milliseconds(new_period);
+          m_items.push(std::move(item));
+        }
+      }
+    }
+  }
+
+  bool m_finish = false;  // set by the shutdown handler; ends run()
+  bool m_cancel = false;  // set by cancelAll(); blocks rescheduling
+  uint64_t m_idcounter = 0;
+  std::condition_variable m_checkWork;
+
+  struct WorkItem {
+    Clock::time_point end;
+    int64_t period = 0;
+    uint64_t id = 0;  // id==0 means it was cancelled
+    std::function<std::pair<bool, int64_t>(bool)> handler;
+    bool operator>(const WorkItem& other) const { return end > other.end; }
+  };
+
+  std::mutex m_mtx;
+  // Inheriting from priority_queue, so we can access the internal container
+  class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>,
+                                           std::greater<WorkItem>> {
+   public:
+    std::vector<WorkItem>& getContainer() { return this->c; }
+  } m_items;
+  // Declared after every member run() touches, so that those members are
+  // initialized before the worker thread starts.
+  ROCKSDB_NAMESPACE::port::Thread m_th;
+  bool closed_ = false;
+};
diff --git a/src/rocksdb/util/timer_queue_test.cc b/src/rocksdb/util/timer_queue_test.cc
new file mode 100644
index 000000000..5f5f08f21
--- /dev/null
+++ b/src/rocksdb/util/timer_queue_test.cc
@@ -0,0 +1,72 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free
+// to copy, modify, share, do whatever you want with it.
+// No attribution is required, but I'll be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to the
+// public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law. You can copy, modify, distribute and perform the
+// work, even for
+// commercial purposes, all without asking permission. See Other Information
+// below.
+//
+
+#include "util/timer_queue.h"
+#include <future>
+
+namespace Timing {
+
+using Clock = std::chrono::high_resolution_clock;
+
+// Returns the number of milliseconds elapsed since the first call to now();
+// the reference point is captured once, by that first call.
+double now() {
+  static const Clock::time_point first_call = Clock::now();
+  const std::chrono::duration<double, std::milli> elapsed =
+      Clock::now() - first_call;
+  return elapsed.count();
+}
+
+}  // namespace Timing
+
+int main() {
+ TimerQueue q;  // the worker thread starts in the constructor
+
+ double tnow = Timing::now();  // reference time captured by every handler below
+
+ q.add(10000, [tnow](bool aborted) mutable {
+ printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+ return std::make_pair(false, 0);  // first == false: one-shot, never rescheduled
+ });
+ q.add(10001, [tnow](bool aborted) mutable {
+ printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+ return std::make_pair(false, 0);  // first == false: one-shot, never rescheduled
+ });
+
+ q.add(1000, [tnow](bool aborted) mutable {
+ printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+ return std::make_pair(!aborted, 1000);  // asks to repeat after 1000 ms unless it was aborted
+ });
+
+ auto id = q.add(2000, [tnow](bool aborted) mutable {
+ printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+ return std::make_pair(!aborted, 2000);  // asks to repeat after 2000 ms unless it was aborted
+ });
+
+ (void)id;  // silences the unused-variable warning while the cancel() call below is disabled
+ // auto ret = q.cancel(id);
+ // assert(ret == 1);
+ // q.cancelAll();
+
+ return 0;  // q is destroyed here; ~TimerQueue() calls shutdown(), which cancels all pending timers
+}
+//////////////////////////////////////////
diff --git a/src/rocksdb/util/user_comparator_wrapper.h b/src/rocksdb/util/user_comparator_wrapper.h
new file mode 100644
index 000000000..fdb27f33e
--- /dev/null
+++ b/src/rocksdb/util/user_comparator_wrapper.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Decorator around a user-supplied Comparator. Every call is forwarded to the
+// wrapped comparator; Compare() and Equal() additionally add one to
+// perf_context.user_key_comparison_count before forwarding.
+class UserComparatorWrapper final : public Comparator {
+ public:
+  // Stores the pointer without copying the comparator it points to.
+  explicit UserComparatorWrapper(const Comparator* const user_cmp)
+      : user_comparator_(user_cmp) {}
+
+  ~UserComparatorWrapper() = default;
+
+  // The comparator that was passed to the constructor.
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  // Counted comparison.
+  int Compare(const Slice& a, const Slice& b) const override {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    const int order = user_comparator_->Compare(a, b);
+    return order;
+  }
+
+  // Counted equality check.
+  bool Equal(const Slice& a, const Slice& b) const override {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    const bool same = user_comparator_->Equal(a, b);
+    return same;
+  }
+
+  // Everything below is forwarded without touching any counter.
+
+  const char* Name() const override { return user_comparator_->Name(); }
+
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    user_comparator_->FindShortestSeparator(start, limit);
+  }
+
+  void FindShortSuccessor(std::string* key) const override {
+    user_comparator_->FindShortSuccessor(key);
+  }
+
+  const Comparator* GetRootComparator() const override {
+    return user_comparator_->GetRootComparator();
+  }
+
+  bool IsSameLengthImmediateSuccessor(const Slice& s,
+                                      const Slice& t) const override {
+    return user_comparator_->IsSameLengthImmediateSuccessor(s, t);
+  }
+
+  bool CanKeysWithDifferentByteContentsBeEqual() const override {
+    return user_comparator_->CanKeysWithDifferentByteContentsBeEqual();
+  }
+
+ private:
+  const Comparator* user_comparator_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/util.h b/src/rocksdb/util/util.h
new file mode 100644
index 000000000..a5fd36490
--- /dev/null
+++ b/src/rocksdb/util/util.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef FALLTHROUGH_INTENDED  // an earlier definition takes precedence
+#if defined(__clang__)
+#define FALLTHROUGH_INTENDED [[clang::fallthrough]]
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define FALLTHROUGH_INTENDED [[gnu::fallthrough]]
+#else  // neither clang nor GCC >= 7: expand to a statement that does nothing
+#define FALLTHROUGH_INTENDED do {} while (0)
+#endif  // compiler selection
+#endif  // FALLTHROUGH_INTENDED
diff --git a/src/rocksdb/util/vector_iterator.h b/src/rocksdb/util/vector_iterator.h
new file mode 100644
index 000000000..fc26ec0c0
--- /dev/null
+++ b/src/rocksdb/util/vector_iterator.h
@@ -0,0 +1,101 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Iterator over a vector of keys/values.
+//
+// The keys and values are taken by value and kept in their original order;
+// iteration order comes from `indices_`, a permutation of [0, keys_.size())
+// sorted with the supplied InternalKeyComparator. `current_` is a position in
+// `indices_`; any position >= indices_.size() means "not valid".
+class VectorIterator : public InternalIterator {
+ public:
+  // keys[i] is paired with values[i]; both vectors must have the same size.
+  VectorIterator(std::vector<std::string> keys, std::vector<std::string> values,
+                 const InternalKeyComparator* icmp)
+      : keys_(std::move(keys)),
+        values_(std::move(values)),
+        indexed_cmp_(icmp, &keys_),
+        current_(0) {
+    assert(keys_.size() == values_.size());
+
+    indices_.reserve(keys_.size());
+    for (size_t i = 0; i < keys_.size(); i++) {
+      indices_.push_back(i);
+    }
+    std::sort(indices_.begin(), indices_.end(), indexed_cmp_);
+  }
+
+  bool Valid() const override {
+    return !indices_.empty() && current_ < indices_.size();
+  }
+
+  void SeekToFirst() override { current_ = 0; }
+  // On an empty vector this wraps around to the largest size_t, which Valid()
+  // reports as invalid.
+  void SeekToLast() override { current_ = indices_.size() - 1; }
+
+  // Positions at the first key that is >= target, or past the end.
+  void Seek(const Slice& target) override {
+    current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
+                                indexed_cmp_) -
+               indices_.begin();
+  }
+
+  // Positions at the last key that is <= target, or becomes invalid when
+  // every key is greater than target.
+  //
+  // upper_bound finds the first key strictly greater than target, so the
+  // entry just before it is the wanted one. (lower_bound followed by Prev()
+  // would step over an entry that compares equal to target.)
+  void SeekForPrev(const Slice& target) override {
+    current_ = std::upper_bound(indices_.begin(), indices_.end(), target,
+                                indexed_cmp_) -
+               indices_.begin();
+    if (!Valid()) {
+      // No key is greater than target: the last key is the answer.
+      SeekToLast();
+    } else {
+      // From position 0 this wraps around and leaves the iterator invalid.
+      Prev();
+    }
+  }
+
+  void Next() override { current_++; }
+  void Prev() override { current_--; }
+
+  // key() and value() require Valid(); the index is not checked here.
+  Slice key() const override {
+    return Slice(keys_[indices_[current_]]);
+  }
+  Slice value() const override {
+    return Slice(values_[indices_[current_]]);
+  }
+
+  Status status() const override { return Status::OK(); }
+
+  bool IsKeyPinned() const override { return true; }
+  bool IsValuePinned() const override { return true; }
+
+ private:
+  // Strict-weak-ordering functor over positions in `*keys`. The mixed
+  // (index, Slice) overloads let std::lower_bound / std::upper_bound compare
+  // a stored key against a search target.
+  struct IndexedKeyComparator {
+    IndexedKeyComparator(const InternalKeyComparator* c,
+                         const std::vector<std::string>* ks)
+        : cmp(c), keys(ks) {}
+
+    bool operator()(size_t a, size_t b) const {
+      return cmp->Compare((*keys)[a], (*keys)[b]) < 0;
+    }
+
+    bool operator()(size_t a, const Slice& b) const {
+      return cmp->Compare((*keys)[a], b) < 0;
+    }
+
+    bool operator()(const Slice& a, size_t b) const {
+      return cmp->Compare(a, (*keys)[b]) < 0;
+    }
+
+    const InternalKeyComparator* cmp;
+    const std::vector<std::string>* keys;
+  };
+
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+  IndexedKeyComparator indexed_cmp_;  // holds a pointer to keys_
+  std::vector<size_t> indices_;
+  size_t current_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/xxh3p.h b/src/rocksdb/util/xxh3p.h
new file mode 100644
index 000000000..0a3cd9808
--- /dev/null
+++ b/src/rocksdb/util/xxh3p.h
@@ -0,0 +1,1648 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+/*
+ xxHash - Extremely Fast Hash algorithm
+ Development source file for `xxh3`
+ Copyright (C) 2019-present, Yann Collet.
+
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* RocksDB Note: This file contains a preview release (xxhash repository
+ version 0.7.2) of XXH3 that is unlikely to be compatible with the final
+ version of XXH3. We have therefore renamed this XXH3p ("preview"), for
+ clarity so that we can continue to use this version even after
+ integrating a newer incompatible version.
+*/
+
+/* Note :
+ This file is separated for development purposes.
+ It will be integrated into `xxhash.c` when development phase is complete.
+*/
+
+#ifndef XXH3p_H
+#define XXH3p_H
+
+
+/* === Dependencies === */
+
+#undef XXH_INLINE_ALL /* in case it's already defined */
+#define XXH_INLINE_ALL
+#include "xxhash.h"
+
+
+/* === Compiler specifics === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 or later: the restrict keyword exists */
+# define XXH_RESTRICT restrict
+#else
+/* note: it might be useful to define this as __restrict or __restrict__ for some C++ compilers */
+# define XXH_RESTRICT /* expands to nothing */
+#endif
+
+#if defined(__GNUC__)
+# if defined(__AVX2__)
+# include <immintrin.h>
+# elif defined(__SSE2__)
+# include <emmintrin.h>
+# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+# define inline __inline__ /* clang bug */
+# include <arm_neon.h>
+# undef inline
+# endif
+#elif defined(_MSC_VER)
+# include <intrin.h>
+#endif
+
+/*
+ * Sanity check.
+ *
+ * XXH3 only requires these features to be efficient:
+ *
+ * - Usable unaligned access
+ * - A 32-bit or 64-bit ALU
+ * - If 32-bit, a decent ADC instruction
+ * - A 32 or 64-bit multiply with a 64-bit result
+ *
+ * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
+ * classic 16-bit only subset of ARM's instruction set.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we
+ * will give a warning.
+ *
+ * Usually, if this happens, it is because of an accident and you probably
+ * need to specify -march, as you probably meant to compile for a newer
+ * architecture.
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+# warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif /* Thumb-1 build for a target that also supports ARM instructions */
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+#define XXH_SCALAR 0 /* chosen when none of the vector extensions below is detected */
+#define XXH_SSE2 1
+#define XXH_AVX2 2
+#define XXH_NEON 3
+#define XXH_VSX 4
+
+#ifndef XXH_VECTOR /* may be predefined on the command line; auto-detected otherwise */
+# if defined(__AVX2__)
+# define XXH_VECTOR XXH_AVX2
+# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+# define XXH_VECTOR XXH_SSE2
+# elif defined(__GNUC__) /* msvc support maybe later */ \
+ && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
+ && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+# define XXH_VECTOR XXH_NEON
+# elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
+# define XXH_VECTOR XXH_VSX
+# else
+# define XXH_VECTOR XXH_SCALAR
+# endif
+#endif /* XXH_VECTOR */
+
+/* Alignment of the accumulator, chosen per vector back end
+ * for compatibility with fast vector loads.
+ * Can be predefined to override the value selected here. */
+#ifndef XXH_ACC_ALIGN
+#  if XXH_VECTOR == XXH_SCALAR
+#    define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == XXH_AVX2
+#    define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == XXH_SSE2 || XXH_VECTOR == XXH_NEON || XXH_VECTOR == XXH_VSX
+#    define XXH_ACC_ALIGN 16
+#  endif
+#endif
+
+/* XXH_mult32to64(x, y): 64-bit product of the low 32 bits of x and of y; as a function: xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u32 b) { return (xxh_u64)a * (xxh_u64)b; } */
+#if defined(_MSC_VER) && defined(_M_IX86)
+# include <intrin.h>
+# define XXH_mult32to64(x, y) __emulu(x, y)
+#else
+# define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
+#endif
+
+/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
+ * there is a lot of mischief with endianness. */
+#if XXH_VECTOR == XXH_VSX
+# include <altivec.h>
+# undef vector
+typedef __vector unsigned long long U64x2;
+typedef __vector unsigned char U8x16;
+typedef __vector unsigned U32x4;
+
+#ifndef XXH_VSX_BE /* may be predefined; otherwise 1 for big-endian targets or element order, else 0 */
+# if defined(__BIG_ENDIAN__) \
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+# define XXH_VSX_BE 1
+# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+# warning "-maltivec=be is not recommended. Please use native endianness."
+# define XXH_VSX_BE 1
+# else
+# define XXH_VSX_BE 0
+# endif
+#endif /* XXH_VSX_BE */
+
+/* We need some helpers for big endian mode. */
+#if XXH_VSX_BE
+/* XXH_vec_revb: POWER9's vec_revb when available, otherwise a vec_perm-based
+ * fallback whose mask reverses the bytes of each 8-byte half of `val`. */
+#  ifdef __POWER9_VECTOR__
+#    define XXH_vec_revb vec_revb
+#  else
+XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
+{
+    /* Bytes 7..0 select the first half reversed, bytes 15..8 the second. */
+    U8x16 const reverseEachHalf = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                    0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    U64x2 const swapped = vec_perm(val, val, reverseEachHalf);
+    return swapped;
+}
+#  endif
+
+/* Power8 Crypto gives us vpermxor which is very handy for
+ * PPC64EB.
+ *
+ * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
+ * {
+ * U8x16 ret;
+ * for (int i = 0; i < 16; i++) {
+ * ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
+ * }
+ * return ret;
+ * }
+ *
+ * Because both of the main loops load the key, swap, and xor it with input,
+ * we can combine the key swap into this instruction.
+ */
+# ifdef vec_permxor /* use vec_permxor when it is available as a macro */
+# define XXH_vec_permxor vec_permxor
+# else
+# define XXH_vec_permxor __builtin_crypto_vpermxor
+# endif /* vec_permxor */
+#endif /* XXH_VSX_BE */
+/*
+ * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
+ * vec_mule.
+ *
+ * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
+ * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
+ * GCC needs inline assembly. */
+#if __has_builtin(__builtin_altivec_vmuleuw)
+# define XXH_vec_mulo __builtin_altivec_vmulouw
+# define XXH_vec_mule __builtin_altivec_vmuleuw
+#else
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
+ U64x2 result;
+ __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+ return result;
+}
+XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
+ U64x2 result;
+ __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+ return result;
+}
+#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+/*
+ * XXH_PREFETCH(ptr):
+ * Hints that the memory at `ptr` will be read soon.
+ * Can be disabled by declaring the XXH_NO_PREFETCH build macro, in which
+ * case (and on unsupported compilers) it only evaluates `ptr`.
+ *
+ * The MSVC branch tests _M_IX86, the macro MSVC defines for 32-bit x86
+ * targets. The legacy _M_I86 macro is not defined there, so testing it
+ * silently disabled prefetching on 32-bit x86 builds.
+ */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() is not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192 /* size in bytes of kSecret; must be at least XXH3p_SECRET_SIZE_MIN (checked below) */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
+# error "default keyset is not large enough"
+#endif
+
+XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { /* default secret: used directly by the unseeded entry points, and as the base of the seed-derived secret in XXH3p_initCustomSecret() */
+ 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+ 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+ 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+ 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+ 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+ 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+ 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+ 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+ 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+ 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+ 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+ 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * XXH_mult64to128():
+ * Multiplies two 64-bit unsigned integers and returns the complete 128-bit
+ * product as an XXH128_hash_t, initialised with the low 64 bits first and
+ * the high 64 bits second.
+ *
+ * About the attribute below: GCC for x86 has a tendency to use SSE for this
+ * function. That avoids register swaps (MUL overwrites EAX and EDX), but it
+ * ends up slower because the cheap register shuffles get replaced by
+ * pshufd and punpckl/hd. The attribute shuts SSE off for this function.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+    /*
+     * Variant 1: the __uint128_t type offered by GCC and Clang.
+     *
+     * On most 64-bit targets this maps to the native widening multiply
+     * (MULQ on x86_64, MUL + UMULH on aarch64) and is the best choice.
+     *
+     * wasm is excluded on purpose: Clang (and emscripten) define the type
+     * there although the target has no arithmetic for it, so the multiply
+     * becomes a slow compiler runtime call. The portable variant is better
+     * in that case.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+    __uint128_t const wide = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t const result = { (xxh_u64)(wide), (xxh_u64)(wide >> 64) };
+    return result;
+
+#elif defined(_M_X64) || defined(_M_IA64)
+    /*
+     * Variant 2: the _umul128 intrinsic.
+     *
+     *   xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * It returns the low half and stores the high half through the pointer.
+     * This compiles to a single-operand MUL on x64.
+     */
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 high_half;
+    xxh_u64 const low_half = _umul128(lhs, rhs, &high_half);
+    XXH128_hash_t const result = { low_half, high_half };
+    return result;
+
+#else
+    /*
+     * Variant 3: portable scalar code, fine for 32-bit and 64-bit ALUs.
+     *
+     * Plain schoolbook multiplication on 32-bit "digits". The same scheme
+     * in base 10, computing 93 x 75:
+     *
+     *        9 3      lhs
+     *      x 7 5      rhs
+     *   --------
+     *        1 5      p00 = 3 * 5
+     *      4 5 .      p10 = 9 * 5
+     *      2 1 .      p01 = 3 * 7
+     *  + 6 3 . .      p11 = 9 * 7
+     *   --------
+     *      2 7 .      mid  = (15 / 10) + (45 % 10) + 21
+     *  + 6 9 . .      high = (27 / 10) + (45 / 10) + 63
+     *   --------
+     *    6 9 7 5
+     *
+     * Why the partial products are summed in this particular way:
+     *  1. No manual carry tracking is needed. In the same way that
+     *     (9 * 9) + 9 + 9 = 99 still fits in two digits, neither sum below
+     *     can exceed UINT64_MAX.
+     *  2. It matches the shape of the ARMv6+ A32/T32 UMAAL instruction,
+     *     which computes Rn * Rm + RdLo + RdHi as a 64-bit value and writes
+     *     it back to RdHi:RdLo. Clang picks it up, and the whole multiply
+     *     then takes only 4 instructions.
+     *  3. Elsewhere it is not terrible either: usually a couple of 32-bit
+     *     ADD/ADC instructions.
+     */
+
+    /* Split both operands into 32-bit halves. */
+    xxh_u64 const a_lo = lhs & 0xFFFFFFFF;
+    xxh_u64 const a_hi = lhs >> 32;
+    xxh_u64 const b_lo = rhs & 0xFFFFFFFF;
+    xxh_u64 const b_hi = rhs >> 32;
+
+    /* The four 32x32 -> 64 partial products. */
+    xxh_u64 const p00 = XXH_mult32to64(a_lo, b_lo);
+    xxh_u64 const p10 = XXH_mult32to64(a_hi, b_lo);
+    xxh_u64 const p01 = XXH_mult32to64(a_lo, b_hi);
+    xxh_u64 const p11 = XXH_mult32to64(a_hi, b_hi);
+
+    /* Combine them; these additions never overflow. */
+    xxh_u64 const mid  = (p00 >> 32) + (p10 & 0xFFFFFFFF) + p01;
+    xxh_u64 const high = (p10 >> 32) + (mid >> 32) + p11;
+    xxh_u64 const low  = (mid << 32) | (p00 & 0xFFFFFFFF);
+
+    XXH128_hash_t result = { low, high };
+    return result;
+#endif
+}
+
+/*
+ * XXH3p_mul128_fold64():
+ * Computes the 128-bit product of `lhs` and `rhs`, then folds it down to
+ * 64 bits by XORing its low and high halves.
+ *
+ * It exists as a separate function so that fewer structs are passed around
+ * by value. The multiply will hopefully be inlined here, but this is not
+ * forced.
+ *
+ * The target attribute is repeated on this function because a target
+ * switch disables inlining.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static xxh_u64
+XXH3p_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t const wide = XXH_mult64to128(lhs, rhs);
+    xxh_u64 const folded = wide.low64 ^ wide.high64;
+    return folded;
+}
+
+
+/*
+ * XXH3p_avalanche():
+ * Final bit mixer: xorshift by 37, multiply by PRIME64_3, xorshift by 32.
+ */
+static XXH64_hash_t XXH3p_avalanche(xxh_u64 h64)
+{
+    xxh_u64 mixed = h64 ^ (h64 >> 37);
+    mixed *= PRIME64_3;
+    return mixed ^ (mixed >> 32);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ========================================== */
+
+/*
+ * XXH3p_len_1to3_64b():
+ * Hashes an input of 1 to 3 bytes.
+ * The first, middle and last byte plus the length are packed into one
+ * 32-bit word, which is then keyed with the first 4 secret bytes and the
+ * seed, multiplied by PRIME64_1 and avalanched.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3p_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    {   /* for len == 1 all three positions refer to the same byte */
+        xxh_u32 const first  = input[0];
+        xxh_u32 const middle = input[len >> 1];
+        xxh_u32 const last   = input[len - 1];
+        xxh_u32 const packed = first | (middle << 8) | (last << 16)
+                             | (((xxh_u32)len) << 24);
+        xxh_u64 const bitflip = XXH_readLE32(secret) + seed;
+        xxh_u64 const keyed   = (xxh_u64)packed ^ bitflip;
+        return XXH3p_avalanche(keyed * PRIME64_1);
+    }
+}
+
+/*
+ * XXH3p_len_4to8_64b():
+ * Hashes an input of 4 to 8 bytes.
+ * The first and the last 4 bytes (which overlap when len < 8) form one
+ * 64-bit word. It is keyed with the first 8 secret bytes and the seed,
+ * then mixed with the length and two multiply / xorshift rounds.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3p_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    {   xxh_u64 const head     = XXH_readLE32(input);
+        xxh_u64 const tail     = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = head | (tail << 32);
+        xxh_u64 const keyed    = input_64 ^ (XXH_readLE64(secret) + seed);
+        xxh_u64 const folded   = keyed ^ (keyed >> 51);
+        xxh_u64 const mix64    = len + (folded * PRIME32_1);
+        xxh_u64 const remixed  = mix64 ^ (mix64 >> 47);
+        return XXH3p_avalanche(remixed * PRIME64_2);
+    }
+}
+
+/*
+ * XXH3p_len_9to16_64b():
+ * Hashes an input of 9 to 16 bytes.
+ * The first and the last 8 bytes (overlapping when len < 16) are each
+ * keyed with 8 secret bytes combined with the seed. The result is the
+ * avalanche of: length + both keyed words + their folded 128-bit product.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3p_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const key_lo = XXH_readLE64(secret) + seed;
+        xxh_u64 const key_hi = XXH_readLE64(secret + 8) - seed;
+        xxh_u64 const lo     = XXH_readLE64(input) ^ key_lo;
+        xxh_u64 const hi     = XXH_readLE64(input + len - 8) ^ key_hi;
+        xxh_u64 acc = len;
+        acc += lo + hi;
+        acc += XXH3p_mul128_fold64(lo, hi);
+        return XXH3p_avalanche(acc);
+    }
+}
+
+/*
+ * XXH3p_len_0to16_64b():
+ * Dispatcher for inputs of at most 16 bytes: picks the routine that
+ * matches `len`.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3p_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    if (len == 0) {
+        /*
+         * RocksDB modification from XXH3 preview: zero result for empty
+         * string can be problematic for multiplication-based algorithms.
+         * Return a hash of the seed instead.
+         */
+        return XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2);
+    }
+    if (len < 4) {
+        return XXH3p_len_1to3_64b(input, len, secret, seed);
+    }
+    if (len <= 8) {
+        return XXH3p_len_4to8_64b(input, len, secret, seed);
+    }
+    return XXH3p_len_9to16_64b(input, len, secret, seed);
+}
+
+
+/* === Long Keys === */
+
+#define STRIPE_LEN 64 /* nb of input bytes consumed by one call to XXH3p_accumulate_512() */
+#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
+#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) /* nb of 64-bit accumulator lanes */
+
+typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e; /* tells XXH3p_accumulate_512() whether the input word is added to its own lane (64bits) or to the neighbouring lane (128bits) */
+
+XXH_FORCE_INLINE void
+XXH3p_accumulate_512( void* XXH_RESTRICT acc,
+ const void* XXH_RESTRICT input,
+ const void* XXH_RESTRICT secret,
+ XXH3p_accWidth_e accWidth)
+{ /*
+ * Mixes one STRIPE_LEN-byte stripe read from `input` into the accumulator
+ * lanes behind `acc`, using STRIPE_LEN bytes of `secret` as the key.
+ * As spelled out by the scalar variant at the bottom, for each 64-bit lane i:
+ *   - the raw input word is added to acc[i] (XXH3p_acc_64bits) or to the
+ *     neighbouring lane acc[i ^ 1] (XXH3p_acc_128bits);
+ *   - the product of the low and high 32-bit halves of
+ *     (input word ^ secret word) is added to acc[i].
+ * The implementation is selected at compile time through XXH_VECTOR.
+ * `acc` is expected to be aligned (asserted by most variants), while
+ * `input` and `secret` are read without any alignment requirement.
+ */
+#if (XXH_VECTOR == XXH_AVX2)
+
+ XXH_ASSERT((((size_t)acc) & 31) == 0);
+ { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
+ const __m256i* const xinput = (const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+ const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+
+ size_t i;
+ for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+ __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
+ __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
+ __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* uint32 dk[8] = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
+ __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
+ if (accWidth == XXH3p_acc_128bits) {
+ __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); /* swap the two 64-bit words of each 128-bit half */
+ __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
+ xacc[i] = _mm256_add_epi64(product, sum);
+ } else { /* XXH3p_acc_64bits */
+ __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
+ xacc[i] = _mm256_add_epi64(product, sum);
+ }
+ } }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+ XXH_ASSERT((((size_t)acc) & 15) == 0);
+ { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
+ const __m128i* const xinput = (const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+ const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+
+ size_t i;
+ for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+ __m128i const data_vec = _mm_loadu_si128 (xinput+i);
+ __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
+ __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); /* uint32 dk[4] = {d0^k0, d1^k1, d2^k2, d3^k3} */
+ __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
+ if (accWidth == XXH3p_acc_128bits) {
+ __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); /* swap the two 64-bit words */
+ __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
+ xacc[i] = _mm_add_epi64(product, sum);
+ } else { /* XXH3p_acc_64bits */
+ __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
+ xacc[i] = _mm_add_epi64(product, sum);
+ }
+ } }
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+ XXH_ASSERT((((size_t)acc) & 15) == 0);
+ {
+ XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
+ /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+ uint8_t const* const xinput = (const uint8_t *) input;
+ uint8_t const* const xsecret = (const uint8_t *) secret;
+
+ size_t i;
+ for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
+#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
+ /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
+ * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
+ * assumes I don't want to destroy it and tries to make a copy. This slows down the code
+ * a lot.
+ * aarch64 not only uses an entirely different syntax, but it requires three
+ * instructions...
+ * ext v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly
+ * zip1 v3.2s, v0.2s, v1.2s // first zip
+ * zip2 v2.2s, v0.2s, v1.2s // second zip
+ * ...to do what ARM does in one:
+ * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */
+
+ /* data_vec = xinput[i]; */
+ uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
+ /* key_vec = xsecret[i]; */
+ uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
+ /* data_key = data_vec ^ key_vec; */
+ uint32x4_t data_key;
+
+ if (accWidth == XXH3p_acc_64bits) {
+ /* Add first to prevent register swaps */
+ /* xacc[i] += data_vec; */
+ xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+ } else { /* XXH3p_acc_128bits */
+ /* xacc[i] += swap(data_vec); */
+ /* can probably be optimized better */
+ uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+ uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+ xacc[i] = vaddq_u64 (xacc[i], swapped);
+ }
+
+ data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
+
+ /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
+ * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
+ __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
+ /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
+ xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
+
+#else
+ /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
+
+ /* data_vec = xinput[i]; */
+ uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
+ /* key_vec = xsecret[i]; */
+ uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
+ /* data_key = data_vec ^ key_vec; */
+ uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
+ /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
+ uint32x2_t const data_key_lo = vmovn_u64 (data_key);
+ /* data_key_hi = (uint32x2_t) (data_key >> 32); */
+ uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
+ if (accWidth == XXH3p_acc_64bits) {
+ /* xacc[i] += data_vec; */
+ xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+ } else { /* XXH3p_acc_128bits */
+ /* xacc[i] += swap(data_vec); */
+ uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+ uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+ xacc[i] = vaddq_u64 (xacc[i], swapped);
+ }
+ /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+ xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
+
+#endif
+ }
+ }
+
+#elif (XXH_VECTOR == XXH_VSX)
+ U64x2* const xacc = (U64x2*) acc; /* presumed aligned */
+ U64x2 const* const xinput = (U64x2 const*) input; /* no alignment restriction */
+ U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */
+ U64x2 const v32 = { 32, 32 };
+#if XXH_VSX_BE
+ U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+ 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+ size_t i;
+ for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+ /* data_vec = xinput[i]; */
+ /* key_vec = xsecret[i]; */
+#if XXH_VSX_BE
+ /* byteswap */
+ U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
+ U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
+ /* The key swap is folded into the permxor (see the comment next to XXH_vec_permxor). data_key = data_vec ^ swap(xsecret[i]); */
+ U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+ U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
+ U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
+ U64x2 const data_key = data_vec ^ key_vec;
+#endif
+ /* shuffled = (data_key << 32) | (data_key >> 32); */
+ U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
+ /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
+ U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
+ xacc[i] += product;
+
+ if (accWidth == XXH3p_acc_64bits) {
+ xacc[i] += data_vec;
+ } else { /* XXH3p_acc_128bits */
+ /* swap high and low halves */
+ U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
+ xacc[i] += data_swapped;
+ }
+ }
+
+#else /* scalar variant of Accumulator - universal */
+
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+ const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
+ const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
+ size_t i;
+ XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+ for (i=0; i < ACC_NB; i++) {
+ xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
+ xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
+
+ if (accWidth == XXH3p_acc_64bits) {
+ xacc[i] += data_val;
+ } else {
+ xacc[i ^ 1] += data_val; /* swap adjacent lanes */
+ }
+ xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); /* low half * high half of the keyed word */
+ }
+#endif
+}
+
+XXH_FORCE_INLINE void
+XXH3p_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{ /*
+ * Scrambles every 64-bit accumulator lane behind `acc` in place.
+ * As spelled out by the scalar variant at the bottom, each lane becomes
+ *   (acc ^ (acc >> 47) ^ secret word) * PRIME32_1
+ * where the secret words are read from STRIPE_LEN bytes at `secret`.
+ * The vector variants build the 64-bit multiply out of two 32x32 -> 64
+ * multiplies: low half * prime, plus (high half * prime) << 32.
+ * The implementation is selected at compile time through XXH_VECTOR.
+ */
+#if (XXH_VECTOR == XXH_AVX2)
+
+ XXH_ASSERT((((size_t)acc) & 31) == 0);
+ { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
+ const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
+ const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
+
+ size_t i;
+ for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+ /* xacc[i] ^= (xacc[i] >> 47) */
+ __m256i const acc_vec = xacc[i];
+ __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
+ __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
+ /* xacc[i] ^= xsecret[i]; */
+ __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
+ __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
+
+ /* xacc[i] *= PRIME32_1; */
+ __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
+ __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
+ __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
+ xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+ }
+ }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+ XXH_ASSERT((((size_t)acc) & 15) == 0);
+ { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
+ const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
+ const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
+
+ size_t i;
+ for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+ /* xacc[i] ^= (xacc[i] >> 47) */
+ __m128i const acc_vec = xacc[i];
+ __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
+ __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
+ /* xacc[i] ^= xsecret[i]; */
+ __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
+ __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
+
+ /* xacc[i] *= PRIME32_1; */
+ __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
+ __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
+ __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
+ xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+ }
+ }
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+ XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+ { uint64x2_t* const xacc = (uint64x2_t*) acc;
+ uint8_t const* const xsecret = (uint8_t const*) secret;
+ uint32x2_t const prime = vdup_n_u32 (PRIME32_1);
+
+ size_t i;
+ for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
+ /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
+ uint64x2_t const acc_vec = xacc[i];
+ uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47);
+ uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted);
+
+ /* key_vec = xsecret[i]; */
+ uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
+ /* data_key = data_vec ^ key_vec; */
+ uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec);
+ /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
+ uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key));
+
+ /* data_key *= PRIME32_1 */
+
+ /* prod_hi = (data_key >> 32) * PRIME32_1; */
+ uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime);
+ /* xacc[i] = prod_hi << 32; */
+ xacc[i] = vshlq_n_u64(prod_hi, 32);
+ /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
+ xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
+ } }
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+ U64x2* const xacc = (U64x2*) acc;
+ const U64x2* const xsecret = (const U64x2*) secret;
+ /* constants */
+ U64x2 const v32 = { 32, 32 };
+ U64x2 const v47 = { 47, 47 };
+ U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
+ size_t i;
+#if XXH_VSX_BE
+ /* endian swap */
+ U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+ 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+ for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+ U64x2 const acc_vec = xacc[i];
+ U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+ /* key_vec = xsecret[i]; */
+#if XXH_VSX_BE
+ /* byte swap the key while XORing it in */
+ U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
+ U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+ U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
+ U64x2 const data_key = data_vec ^ key_vec;
+#endif
+
+ /* data_key *= PRIME32_1 */
+
+ /* prod_even is the term shifted left by 32 below, i.e. it plays the role of prod_hi in the other variants: ((U64x2)data_key >> 32) * prime */
+ U64x2 const prod_even = XXH_vec_mule((U32x4)data_key, prime);
+ /* prod_odd is added unshifted, i.e. it plays the role of prod_lo: ((U64x2)data_key & 0xFFFFFFFF) * prime */
+ U64x2 const prod_odd = XXH_vec_mulo((U32x4)data_key, prime);
+ xacc[i] = prod_odd + (prod_even << v32);
+ }
+
+#else /* scalar variant of Scrambler - universal */
+
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+ const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
+ size_t i;
+ XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+ for (i=0; i < ACC_NB; i++) {
+ xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
+ xxh_u64 acc64 = xacc[i];
+ acc64 ^= acc64 >> 47;
+ acc64 ^= key64;
+ acc64 *= PRIME32_1;
+ xacc[i] = acc64;
+ }
+
+#endif
+}
+
+/* Distance, in bytes, ahead of the current stripe at which input is prefetched. */
+#define XXH_PREFETCH_DIST 384
+
+/*
+ * XXH3p_accumulate():
+ * Accumulates `nbStripes` consecutive stripes of `input` into `acc`.
+ * The stripe with index n starts at input + n*STRIPE_LEN and is keyed with
+ * the secret starting at secret + n*XXH_SECRET_CONSUME_RATE.
+ *
+ * Assumption: nbStripes will not overflow the secret size.
+ */
+XXH_FORCE_INLINE void
+XXH3p_accumulate(      xxh_u64* XXH_RESTRICT acc,
+                 const xxh_u8*  XXH_RESTRICT input,
+                 const xxh_u8*  XXH_RESTRICT secret,
+                       size_t   nbStripes,
+                       XXH3p_accWidth_e accWidth)
+{
+    size_t stripe;
+    for (stripe = 0; stripe != nbStripes; ++stripe) {
+        const xxh_u8* const stripeStart = input + stripe*STRIPE_LEN;
+        const xxh_u8* const stripeKey   = secret + stripe*XXH_SECRET_CONSUME_RATE;
+        XXH_PREFETCH(stripeStart + XXH_PREFETCH_DIST);
+        XXH3p_accumulate_512(acc, stripeStart, stripeKey, accWidth);
+    }
+}
+
+/* Offset, counted back from the last stripe of the secret, of the key used
+ * for the final stripe. Do not align on 8, so that this part of the secret
+ * is different from the one used by the scrambler. */
+#define XXH_SECRET_LASTACC_START 7
+
+/*
+ * XXH3p_hashLong_internal_loop():
+ * Consumes the whole input into `acc`:
+ *  - full blocks: every stripe of the block is accumulated, then the
+ *    accumulators are scrambled with the last STRIPE_LEN bytes of secret;
+ *  - the remaining full stripes of the last, partial block;
+ *  - when `len` is not a multiple of STRIPE_LEN, the last STRIPE_LEN bytes
+ *    of the input, keyed at the XXH_SECRET_LASTACC_START offset.
+ *
+ * Linkage note: clang auto-vectorizes well in SSE2 mode _if_ this function
+ * is `static`, and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
+ * However, it auto-vectorizes better for AVX2 if it is `FORCE_INLINE`.
+ * Pretty much every other mode and compiler prefers `FORCE_INLINE`.
+ */
+#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
+static void
+#else
+XXH_FORCE_INLINE void
+#endif
+XXH3p_hashLong_internal_loop(      xxh_u64* XXH_RESTRICT acc,
+                             const xxh_u8*  XXH_RESTRICT input, size_t len,
+                             const xxh_u8*  XXH_RESTRICT secret, size_t secretSize,
+                             XXH3p_accWidth_e accWidth)
+{
+    size_t const stripesPerBlock = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const bytesPerBlock   = STRIPE_LEN * stripesPerBlock;
+    size_t const fullBlocks      = len / bytesPerBlock;
+    size_t const tailStripes     = (len - (bytesPerBlock * fullBlocks)) / STRIPE_LEN;
+    size_t block;
+
+    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
+
+    /* full blocks */
+    for (block = 0; block < fullBlocks; block++) {
+        XXH3p_accumulate(acc, input + block*bytesPerBlock, secret, stripesPerBlock, accWidth);
+        XXH3p_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
+    }
+
+    /* full stripes of the last, partial block */
+    XXH_ASSERT(len > STRIPE_LEN);
+    XXH_ASSERT(tailStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+    XXH3p_accumulate(acc, input + fullBlocks*bytesPerBlock, secret, tailStripes, accWidth);
+
+    /* last stripe: re-reads the final STRIPE_LEN bytes of the input */
+    if ((len & (STRIPE_LEN - 1)) != 0) {
+        const xxh_u8* const lastStripe = input + len - STRIPE_LEN;
+        XXH3p_accumulate_512(acc, lastStripe,
+                             secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START,
+                             accWidth);
+    }
+}
+
+/*
+ * XXH3p_mix2Accs():
+ * Keys two adjacent accumulators with 16 bytes of secret and returns the
+ * folded 128-bit product of the two keyed values.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH3p_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    xxh_u64 const keyed0 = acc[0] ^ XXH_readLE64(secret);
+    xxh_u64 const keyed1 = acc[1] ^ XXH_readLE64(secret+8);
+    return XXH3p_mul128_fold64(keyed0, keyed1);
+}
+
+/*
+ * XXH3p_mergeAccs():
+ * Reduces the 8 accumulators to a single 64-bit hash: starting from
+ * `start`, adds the mix of each of the 4 accumulator pairs (each pair
+ * keyed with its own 16 bytes of `secret`), then avalanches the sum.
+ */
+static XXH64_hash_t
+XXH3p_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 sum = start;
+    int pair;
+
+    for (pair = 0; pair < 4; pair++) {
+        sum += XXH3p_mix2Accs(acc + 2*pair, secret + 16*pair);
+    }
+
+    return XXH3p_avalanche(sum);
+}
+
+/*
+ * Initial values of the accumulator lanes.
+ * The macro expands to a brace-enclosed initializer only; the statement that
+ * uses it supplies the terminating semicolon, e.g.
+ *     xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;
+ * (A semicolon inside the macro would leave a stray empty statement behind
+ * every such declaration.)
+ */
+#define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
+                         PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }
+
+/* Offset into the secret of the key used to merge the accumulators.
+ * Do not align on 8, so that this part of the secret is different from the
+ * one used by the accumulator. */
+#define XXH_SECRET_MERGEACCS_START 11
+
+/*
+ * XXH3p_hashLong_internal():
+ * 64-bit hash of a long input: initialises the accumulators, runs the
+ * accumulation loop in XXH3p_acc_64bits mode over the whole input, then
+ * merges the accumulators, starting from len * PRIME64_1.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3p_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
+                        const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
+{
+    xxh_u64 const lengthMix = (xxh_u64)len * PRIME64_1;
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;
+
+    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_64bits);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, lengthMix);
+}
+
+
+/*
+ * XXH3p_hashLong_64b_defaultSecret():
+ * Long-input hash using the built-in kSecret.
+ * It's important for performance that XXH3p_hashLong is not inlined.
+ * Not sure why (uop cache maybe ?), but the difference is large and easily
+ * measurable.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3p_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
+{
+    size_t const defaultSecretSize = sizeof(kSecret);
+    return XXH3p_hashLong_internal(input, len, kSecret, defaultSecretSize);
+}
+
+/*
+ * XXH3p_hashLong_64b_withSecret():
+ * Long-input hash using a caller-provided secret of `secretSize` bytes.
+ * It's important for performance that XXH3p_hashLong is not inlined.
+ * Not sure why (uop cache maybe ?), but the difference is large and easily
+ * measurable.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3p_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
+                              const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
+{
+    XXH64_hash_t const hash = XXH3p_hashLong_internal(input, len, secret, secretSize);
+    return hash;
+}
+
+
+/*
+ * XXH_writeLE64():
+ * Stores `v64` at `dst` in little-endian byte order.
+ * The value is byte swapped first on non little-endian CPUs, and copied
+ * with memcpy.
+ */
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    xxh_u64 const le64 = XXH_CPU_LITTLE_ENDIAN ? v64 : XXH_swap64(v64);
+    memcpy(dst, &le64, sizeof(le64));
+}
+
+/*
+ * XXH3p_initCustomSecret():
+ * Derives a secret from kSecret and `seed64`: within every 16-byte group,
+ * the seed is added to the first 64-bit word and subtracted from the second.
+ * Destination `customSecret` is presumed allocated and of the same size as
+ * `kSecret`.
+ */
+XXH_FORCE_INLINE void XXH3p_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
+{
+    size_t offset;
+
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+    for (offset = 0; offset < XXH_SECRET_DEFAULT_SIZE; offset += 16) {
+        XXH_writeLE64(customSecret + offset,     XXH_readLE64(kSecret + offset)     + seed64);
+        XXH_writeLE64(customSecret + offset + 8, XXH_readLE64(kSecret + offset + 8) - seed64);
+    }
+}
+
+
+/*
+ * XXH3p_hashLong_64b_withSeed():
+ * Long-input hash with a seed. A custom secret is generated by altering
+ * the default kSecret with the seed, and is then used for long mode
+ * hashing. A zero seed is routed to the default-secret path, which skips
+ * the generation.
+ * Generating the secret is decently fast but nonetheless costs a little
+ * bit of time; try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3p_hashLong is not inlined.
+ * Not sure why (uop cache maybe ?), but the difference is large and easily
+ * measurable.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3p_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+
+    if (seed != 0) {
+        XXH3p_initCustomSecret(secret, seed);
+        return XXH3p_hashLong_internal(input, len, secret, sizeof(secret));
+    }
+    return XXH3p_hashLong_64b_defaultSecret(input, len);
+}
+
+
+/*
+ * XXH3p_mix16B():
+ * Mixes 16 bytes of input with 16 bytes of secret. The first secret word
+ * has the seed added, the second has it subtracted; each is XORed with the
+ * matching input word, and the folded 128-bit product of the two results
+ * is returned.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3p_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                      const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+    xxh_u64 const word_lo = XXH_readLE64(input);
+    xxh_u64 const word_hi = XXH_readLE64(input+8);
+    xxh_u64 const key_lo  = XXH_readLE64(secret)   + seed64;
+    xxh_u64 const key_hi  = XXH_readLE64(secret+8) - seed64;
+    return XXH3p_mul128_fold64(word_lo ^ key_lo, word_hi ^ key_hi);
+}
+
+
+/*
+ * XXH3p_len_17to128_64b():
+ * Hashes an input of 17 to 128 bytes.
+ * 16-byte chunks are mixed in pairs, one taken from the front and one from
+ * the back of the input. The longer the input, the more pairs take part;
+ * the outermost pair (first and last 16 bytes) is always included.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3p_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * PRIME64_1;
+
+        /* innermost pair first, working towards the outermost one */
+        if (len > 96) {
+            acc += XXH3p_mix16B(input+48, secret+96, seed);
+            acc += XXH3p_mix16B(input+len-64, secret+112, seed);
+        }
+        if (len > 64) {
+            acc += XXH3p_mix16B(input+32, secret+64, seed);
+            acc += XXH3p_mix16B(input+len-48, secret+80, seed);
+        }
+        if (len > 32) {
+            acc += XXH3p_mix16B(input+16, secret+32, seed);
+            acc += XXH3p_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3p_mix16B(input+0, secret+0, seed);
+        acc += XXH3p_mix16B(input+len-16, secret+16, seed);
+
+        return XXH3p_avalanche(acc);
+    }
+}
+
+/* Largest input length handled by the mid-size routine below. */
+#define XXH3p_MIDSIZE_MAX 240
+
+/* Secret offsets used by the mid-size routine for the chunks after the
+ * first 128 bytes, and for the final chunk. */
+#define XXH3p_MIDSIZE_STARTOFFSET 3
+#define XXH3p_MIDSIZE_LASTOFFSET  17
+
+/*
+ * XXH3p_len_129to240_64b():
+ * Hashes an input of 129 to XXH3p_MIDSIZE_MAX bytes.
+ * The first 8 chunks of 16 bytes are mixed and avalanched. The remaining
+ * full chunks are then mixed against the secret shifted by
+ * XXH3p_MIDSIZE_STARTOFFSET, followed by the last 16 bytes of the input.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3p_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);
+
+    {   xxh_u64 acc = len * PRIME64_1;
+        int const nbRounds = (int)len / 16;
+        int round = 0;
+
+        /* first 128 bytes */
+        while (round < 8) {
+            acc += XXH3p_mix16B(input+(16*round), secret+(16*round), seed);
+            round++;
+        }
+        acc = XXH3p_avalanche(acc);
+
+        /* remaining full 16-byte chunks */
+        XXH_ASSERT(nbRounds >= 8);
+        while (round < nbRounds) {
+            acc += XXH3p_mix16B(input+(16*round), secret+(16*(round-8)) + XXH3p_MIDSIZE_STARTOFFSET, seed);
+            round++;
+        }
+
+        /* last bytes */
+        acc += XXH3p_mix16B(input + len - 16, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET, seed);
+        return XXH3p_avalanche(acc);
+    }
+}
+
+/* === Public entry point === */
+
+/* One-shot 64-bit hash of `input[0..len)` with the built-in secret (kSecret)
+ * and seed 0. Dispatches on `len`: <= 16, <= 128, <= XXH3p_MIDSIZE_MAX, and
+ * otherwise the long-input routine. */
+XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* input, size_t len)
+{
+ if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
+ if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+ if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+ return XXH3p_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
+}
+
+/* One-shot 64-bit hash of `input[0..len)` keyed by a caller-supplied `secret`
+ * of `secretSize` bytes, with seed 0. Same length dispatch as XXH3p_64bits().
+ * The minimum secret size is only asserted, not checked at run time. */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3p_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+ XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
+ /* if an action must be taken should `secret` conditions not be respected,
+ * it should be done here.
+ * For now, it's a contract pre-condition.
+ * Adding a check and a branch here would cost performance at every hash */
+ if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+ if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+ if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+ return XXH3p_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+/* One-shot 64-bit hash of `input[0..len)` using the built-in secret (kSecret)
+ * and a caller-supplied `seed`. Short and mid-size inputs pass the seed to the
+ * per-length routines; longer inputs go to XXH3p_hashLong_64b_withSeed(). */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3p_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+ if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
+ if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+ if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+ return XXH3p_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+/* === XXH3 streaming === */
+
+/* Allocate a streaming state with XXH_malloc() and return it unchanged
+ * (including a NULL result). The memory is not initialized here.
+ * NOTE(review): presumably one of the *_reset functions must be called before
+ * the state is used - confirm against the public header. */
+XXH_PUBLIC_API XXH3p_state_t* XXH3p_createState(void)
+{
+ return (XXH3p_state_t*)XXH_malloc(sizeof(XXH3p_state_t));
+}
+
+/* Release a state through XXH_free(). Always returns XXH_OK. */
+XXH_PUBLIC_API XXH_errorcode XXH3p_freeState(XXH3p_state_t* statePtr)
+{
+ XXH_free(statePtr);
+ return XXH_OK;
+}
+
+/* Copy the complete streaming state from `src_state` into `dst_state`.
+ * The *_reset_withSeed() functions make `secret` point at the state's own
+ * `customSecret` array. A plain memcpy would leave dst_state->secret aimed at
+ * src_state's storage (dangling once src_state is freed, stale once it is
+ * reset), so that self-reference is re-based onto dst_state's own copy. */
+XXH_PUBLIC_API void
+XXH3p_copyState(XXH3p_state_t* dst_state, const XXH3p_state_t* src_state)
+{
+ memcpy(dst_state, src_state, sizeof(*dst_state));
+ if ((const void*)src_state->secret == (const void*)src_state->customSecret) {
+ dst_state->secret = dst_state->customSecret;
+ }
+}
+
+/* Common state initializer.
+ * Zeroes the whole state, loads the eight accumulators with their PRIME*
+ * start values, records `seed` and the `secret` pointer (the secret is not
+ * copied), and derives:
+ *   secretLimit       = secretSize - STRIPE_LEN
+ *   nbStripesPerBlock = secretLimit / XXH_SECRET_CONSUME_RATE
+ * `statePtr`, `secret` and the minimum secret size are only asserted, so
+ * callers must validate them beforehand. */
+static void
+XXH3p_64bits_reset_internal(XXH3p_state_t* statePtr,
+ XXH64_hash_t seed,
+ const xxh_u8* secret, size_t secretSize)
+{
+ XXH_ASSERT(statePtr != NULL);
+ memset(statePtr, 0, sizeof(*statePtr));
+ statePtr->acc[0] = PRIME32_3;
+ statePtr->acc[1] = PRIME64_1;
+ statePtr->acc[2] = PRIME64_2;
+ statePtr->acc[3] = PRIME64_3;
+ statePtr->acc[4] = PRIME64_4;
+ statePtr->acc[5] = PRIME32_2;
+ statePtr->acc[6] = PRIME64_5;
+ statePtr->acc[7] = PRIME32_1;
+ statePtr->seed = seed;
+ XXH_ASSERT(secret != NULL);
+ statePtr->secret = secret;
+ XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
+ statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
+ statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/* Reset `statePtr` for 64-bit streaming with seed 0 and the built-in secret.
+ * Returns XXH_ERROR when `statePtr` is NULL, XXH_OK otherwise. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_64bits_reset(XXH3p_state_t* statePtr)
+{
+ if (statePtr == NULL) return XXH_ERROR;
+ XXH3p_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+ return XXH_OK;
+}
+
+/* Reset `statePtr` for 64-bit streaming keyed by a caller-supplied secret
+ * (seed 0). The secret is referenced, not copied.
+ * Returns XXH_ERROR, leaving the state untouched, when `statePtr` or `secret`
+ * is NULL or when `secretSize` is below XXH3p_SECRET_SIZE_MIN.
+ * The arguments are validated before XXH3p_64bits_reset_internal() runs,
+ * because that helper asserts on them and computes secretSize - STRIPE_LEN,
+ * which would wrap around for an undersized secret. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_64bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize)
+{
+ if (statePtr == NULL) return XXH_ERROR;
+ if (secret == NULL) return XXH_ERROR;
+ if (secretSize < XXH3p_SECRET_SIZE_MIN) return XXH_ERROR;
+ XXH3p_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+ return XXH_OK;
+}
+
+/* Reset `statePtr` for 64-bit streaming with a caller-supplied `seed`.
+ * After the common reset, XXH3p_initCustomSecret() fills the state's own
+ * `customSecret` array from the seed, and `secret` is redirected to that
+ * array. Returns XXH_ERROR when `statePtr` is NULL, XXH_OK otherwise. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_64bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed)
+{
+ if (statePtr == NULL) return XXH_ERROR;
+ XXH3p_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+ XXH3p_initCustomSecret(statePtr->customSecret, seed);
+ statePtr->secret = statePtr->customSecret;
+ return XXH_OK;
+}
+
+/* Accumulate `totalStripes` stripes of STRIPE_LEN bytes from `input` into `acc`.
+ * `*nbStripesSoFarPtr` is the position inside the current block; the secret is
+ * read at that position times XXH_SECRET_CONSUME_RATE. When the call reaches
+ * the end of the block, the accumulators are scrambled once with the secret at
+ * `secretLimit`, and the remaining stripes restart from the beginning of the
+ * secret. At most one scramble is performed per call.
+ * `*nbStripesSoFarPtr` is updated to the new position inside the block. */
+XXH_FORCE_INLINE void
+XXH3p_consumeStripes( xxh_u64* acc,
+ XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
+ const xxh_u8* input, size_t totalStripes,
+ const xxh_u8* secret, size_t secretLimit,
+ XXH3p_accWidth_e accWidth)
+{
+ XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
+ if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
+ /* need a scrambling operation */
+ size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; /* stripes left in the current block */
+ XXH3p_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
+ XXH3p_scrambleAcc(acc, secret + secretLimit);
+ XXH3p_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
+ *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
+ } else {
+ XXH3p_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
+ *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
+ }
+}
+
+/* Feed `len` bytes of `input` into the streaming state.
+ * A NULL `input` returns XXH_OK when XXH_ACCEPT_NULL_INPUT_POINTER >= 1 and
+ * XXH_ERROR otherwise. Input that still fits in the internal buffer is only
+ * copied there. Otherwise the buffer is topped up and consumed, then `input`
+ * is consumed directly in XXH3p_INTERNALBUFFER_SIZE quantities, and any
+ * leftover tail is stored at the start of the buffer. `state->totalLen`
+ * always grows by `len`. `state` itself is not checked for NULL. */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3p_update(XXH3p_state_t* state, const xxh_u8* input, size_t len, XXH3p_accWidth_e accWidth)
+{
+ if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+ return XXH_OK;
+#else
+ return XXH_ERROR;
+#endif
+
+ { const xxh_u8* const bEnd = input + len;
+
+ state->totalLen += len;
+
+ if (state->bufferedSize + len <= XXH3p_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
+ XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+ state->bufferedSize += (XXH32_hash_t)len;
+ return XXH_OK;
+ }
+ /* input now > XXH3p_INTERNALBUFFER_SIZE */
+
+ #define XXH3p_INTERNALBUFFER_STRIPES (XXH3p_INTERNALBUFFER_SIZE / STRIPE_LEN)
+ XXH_STATIC_ASSERT(XXH3p_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */
+
+ if (state->bufferedSize) { /* some input within internal buffer: fill then consume it */
+ size_t const loadSize = XXH3p_INTERNALBUFFER_SIZE - state->bufferedSize;
+ XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+ input += loadSize;
+ XXH3p_consumeStripes(state->acc,
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
+ state->buffer, XXH3p_INTERNALBUFFER_STRIPES,
+ state->secret, state->secretLimit,
+ accWidth);
+ state->bufferedSize = 0;
+ }
+
+ /* consume input by full buffer quantities */
+ if (input+XXH3p_INTERNALBUFFER_SIZE <= bEnd) {
+ const xxh_u8* const limit = bEnd - XXH3p_INTERNALBUFFER_SIZE;
+ do {
+ XXH3p_consumeStripes(state->acc,
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
+ input, XXH3p_INTERNALBUFFER_STRIPES,
+ state->secret, state->secretLimit,
+ accWidth);
+ input += XXH3p_INTERNALBUFFER_SIZE;
+ } while (input<=limit);
+ }
+
+ if (input < bEnd) { /* some remaining input : buffer it */
+ XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+ state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+ }
+ }
+
+ return XXH_OK;
+}
+
+/* Public 64-bit streaming update: XXH3p_update() with XXH3p_acc_64bits. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_64bits_update(XXH3p_state_t* state, const void* input, size_t len)
+{
+ return XXH3p_update(state, (const xxh_u8*)input, len, XXH3p_acc_64bits);
+}
+
+
+/* Finish the long-input accumulation into the caller's `acc` array without
+ * modifying `state`.
+ * With at least one full stripe buffered, the full stripes are consumed and,
+ * if a partial stripe remains, the last STRIPE_LEN buffered bytes are
+ * accumulated once more as the final stripe. With less than one stripe
+ * buffered, the final stripe is assembled from the tail of state->buffer
+ * followed by the buffered bytes.
+ * NOTE(review): the tail of state->buffer presumably still holds the most
+ * recently consumed input - confirm against XXH3p_update(). */
+XXH_FORCE_INLINE void
+XXH3p_digest_long (XXH64_hash_t* acc, const XXH3p_state_t* state, XXH3p_accWidth_e accWidth)
+{
+ memcpy(acc, state->acc, sizeof(state->acc)); /* digest locally, state remains unaltered, and can continue ingesting more input afterwards */
+ if (state->bufferedSize >= STRIPE_LEN) {
+ size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
+ XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; /* local copy so the state's counter stays untouched */
+ XXH3p_consumeStripes(acc,
+ &nbStripesSoFar, state->nbStripesPerBlock,
+ state->buffer, totalNbStripes,
+ state->secret, state->secretLimit,
+ accWidth);
+ if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */
+ XXH3p_accumulate_512(acc,
+ state->buffer + state->bufferedSize - STRIPE_LEN,
+ state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
+ accWidth);
+ }
+ } else { /* bufferedSize < STRIPE_LEN */
+ if (state->bufferedSize) { /* one last stripe */
+ xxh_u8 lastStripe[STRIPE_LEN];
+ size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
+ memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+ memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+ XXH3p_accumulate_512(acc,
+ lastStripe,
+ state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
+ accWidth);
+ } }
+}
+
+/* Return the 64-bit hash of everything fed to `state` so far; the state is
+ * not modified. Totals above XXH3p_MIDSIZE_MAX finish the accumulators with
+ * XXH3p_digest_long() and merge them. Shorter totals re-hash state->buffer
+ * with the one-shot functions, using the seed variant when a seed was set.
+ * NOTE(review): the short path relies on the whole input still sitting in
+ * state->buffer, i.e. XXH3p_INTERNALBUFFER_SIZE >= XXH3p_MIDSIZE_MAX - confirm. */
+XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits_digest (const XXH3p_state_t* state)
+{
+ if (state->totalLen > XXH3p_MIDSIZE_MAX) {
+ XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+ XXH3p_digest_long(acc, state, XXH3p_acc_64bits);
+ return XXH3p_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
+ }
+ /* len <= XXH3p_MIDSIZE_MAX : short code */
+ if (state->seed)
+ return XXH3p_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+ return XXH3p_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* ==========================================
+ * XXH3 128 bits (=> XXH128)
+ * ========================================== */
+
+/* 128-bit hash for inputs of 1..3 bytes.
+ * The first, middle (index len/2) and last byte plus `len` are packed into one
+ * 32-bit word; that word and its byte-swapped form are keyed with
+ * secret[0..3] + seed and secret[4..7] - seed, multiplied by PRIME64_1 and
+ * PRIME64_5 respectively, and avalanched into the low and high halves. */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3p_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(1 <= len && len <= 3);
+ XXH_ASSERT(secret != NULL);
+ { xxh_u8 const c1 = input[0];
+ xxh_u8 const c2 = input[len >> 1];
+ xxh_u8 const c3 = input[len - 1];
+ xxh_u32 const combinedl = ((xxh_u32)c1) + (((xxh_u32)c2) << 8) + (((xxh_u32)c3) << 16) + (((xxh_u32)len) << 24);
+ xxh_u32 const combinedh = XXH_swap32(combinedl);
+ xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ (XXH_readLE32(secret) + seed);
+ xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ (XXH_readLE32(secret+4) - seed);
+ xxh_u64 const mixedl = keyed_lo * PRIME64_1;
+ xxh_u64 const mixedh = keyed_hi * PRIME64_5;
+ XXH128_hash_t const h128 = { XXH3p_avalanche(mixedl) /*low64*/, XXH3p_avalanche(mixedh) /*high64*/ };
+ return h128;
+ }
+}
+
+
+/* 128-bit hash for inputs of 4..8 bytes.
+ * The first and the last 4 bytes (which overlap when len < 8) form one 64-bit
+ * word; that word and its byte-swapped form are keyed with secret[0..7] + seed
+ * and secret[8..15] - seed. Each half then goes through two xor-shift/multiply
+ * rounds, with `len` added to the low half and subtracted from the high half,
+ * followed by an avalanche. */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3p_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(secret != NULL);
+ XXH_ASSERT(4 <= len && len <= 8);
+ { xxh_u32 const input_lo = XXH_readLE32(input);
+ xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+ xxh_u64 const input_64_lo = input_lo + ((xxh_u64)input_hi << 32);
+ xxh_u64 const input_64_hi = XXH_swap64(input_64_lo);
+ xxh_u64 const keyed_lo = input_64_lo ^ (XXH_readLE64(secret) + seed);
+ xxh_u64 const keyed_hi = input_64_hi ^ (XXH_readLE64(secret + 8) - seed);
+ xxh_u64 const mix64l1 = len + ((keyed_lo ^ (keyed_lo >> 51)) * PRIME32_1);
+ xxh_u64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
+ xxh_u64 const mix64h1 = ((keyed_hi ^ (keyed_hi >> 47)) * PRIME64_1) - len;
+ xxh_u64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
+ { XXH128_hash_t const h128 = { XXH3p_avalanche(mix64l2) /*low64*/, XXH3p_avalanche(mix64h2) /*high64*/ };
+ return h128;
+ } }
+}
+
+/* 128-bit hash for inputs of 9..16 bytes.
+ * The first and the last 8 bytes (which overlap when len < 16) are keyed with
+ * secret[0..7] + seed and secret[8..15] - seed. Their XOR is widened by
+ * XXH_mult64to128() with PRIME64_1, the length contribution and the keyed
+ * high word are folded in, and a second XXH_mult64to128() with PRIME64_2
+ * produces the two halves, which are avalanched separately. */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3p_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(input != NULL);
+ XXH_ASSERT(secret != NULL);
+ XXH_ASSERT(9 <= len && len <= 16);
+ { xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
+ xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed);
+ XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1);
+ xxh_u64 const lenContrib = XXH_mult32to64(len, PRIME32_5);
+ m128.low64 += lenContrib;
+ m128.high64 += input_hi * PRIME64_1;
+ m128.low64 ^= (m128.high64 >> 32);
+ { XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
+ h128.high64 += m128.high64 * PRIME64_2;
+ h128.low64 = XXH3p_avalanche(h128.low64);
+ h128.high64 = XXH3p_avalanche(h128.high64);
+ return h128;
+ } }
+}
+
+/* 128-bit hash for inputs of 0..16 bytes: dispatches to the 9..16, 4..8 or
+ * 1..3 byte routine; an empty input yields { 0, 0 } regardless of seed.
+ * Assumption : `secret` size is >= 16
+ * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3p_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ XXH_ASSERT(len <= 16);
+ { if (len > 8) return XXH3p_len_9to16_128b(input, len, secret, seed);
+ if (len >= 4) return XXH3p_len_4to8_128b(input, len, secret, seed);
+ if (len) return XXH3p_len_1to3_128b(input, len, secret, seed);
+ { XXH128_hash_t const h128 = { 0, 0 };
+ return h128;
+ } }
+}
+
+/* 128-bit long-input hash core.
+ * Runs XXH3p_hashLong_internal_loop() over the input in XXH3p_acc_128bits mode,
+ * then merges the same accumulators twice: the low half with the secret at
+ * XXH_SECRET_MERGEACCS_START and start value len * PRIME64_1, the high half
+ * with the mirrored window at the end of the secret and start value
+ * ~(len * PRIME64_2). */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3p_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
+{
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;
+
+ XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_128bits);
+
+ /* converge into final hash */
+ XXH_STATIC_ASSERT(sizeof(acc) == 64);
+ XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+ { xxh_u64 const low64 = XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
+ xxh_u64 const high64 = XXH3p_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));
+ XXH128_hash_t const h128 = { low64, high64 };
+ return h128;
+ }
+}
+
+/* 128-bit long-input hash with the built-in secret (kSecret). */
+XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3p_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+ return XXH3p_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+/* 128-bit long-input hash with a caller-supplied secret of `secretSize` bytes. */
+XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3p_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+ const xxh_u8* secret, size_t secretSize)
+{
+ return XXH3p_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+/* 128-bit long-input hash with a seed. Seed 0 takes the default-secret path;
+ * any other seed first fills a stack-local secret via XXH3p_initCustomSecret(). */
+XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3p_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+ XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+ if (seed == 0) return XXH3p_hashLong_128b_defaultSecret(input, len);
+ XXH3p_initCustomSecret(secret, seed);
+ return XXH3p_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+/* Fold two 16-byte chunks into a 128-bit accumulator, crossing the halves:
+ * low64 gains mix16B(input_1, secret) and is XORed with the sum of input_2's
+ * two 64-bit words; high64 gains mix16B(input_2, secret+16) and is XORed with
+ * the sum of input_1's two 64-bit words. Consumes 32 bytes of `secret`. */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
+{
+ acc.low64 += XXH3p_mix16B (input_1, secret+0, seed);
+ acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+ acc.high64 += XXH3p_mix16B (input_2, secret+16, seed);
+ acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+ return acc;
+}
+
+/* 128-bit hash for inputs of 129..XXH3p_MIDSIZE_MAX bytes.
+ * Phase 1 mixes the first 4 chunks of 32 bytes with secret offsets 32*i and
+ * avalanches both halves. Phase 2 mixes the remaining whole 32-byte chunks
+ * against the secret restarted at XXH3p_MIDSIZE_STARTOFFSET. The last 32
+ * input bytes are then mixed (final 16 first) with a negated seed, and the
+ * two halves are combined with `len` and `seed` into the final low/high pair. */
+XXH_NO_INLINE XXH128_hash_t
+XXH3p_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);
+
+ { XXH128_hash_t acc;
+ int const nbRounds = (int)len / 32; /* number of whole 32-byte chunks */
+ int i;
+ acc.low64 = len * PRIME64_1;
+ acc.high64 = 0;
+ for (i=0; i<4; i++) {
+ acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
+ }
+ acc.low64 = XXH3p_avalanche(acc.low64);
+ acc.high64 = XXH3p_avalanche(acc.high64);
+ XXH_ASSERT(nbRounds >= 4);
+ for (i=4 ; i < nbRounds; i++) {
+ acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3p_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
+ }
+ /* last bytes */
+ acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);
+
+ { xxh_u64 const low64 = acc.low64 + acc.high64;
+ xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
+ XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
+ return h128;
+ }
+ }
+}
+
+
+/* 128-bit hash for inputs of 17..128 bytes.
+ * Each XXH128_mix32B() call pairs a 16-byte chunk from the front of `input`
+ * with one from the back; each length threshold (32, 64, 96) enables one more
+ * pair with its own 32-byte slice of `secret`. The two halves are finally
+ * combined with `len` and `seed` and avalanched.
+ * `secretSize` is only consulted by the assertion. */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3p_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(16 < len && len <= 128);
+
+ { XXH128_hash_t acc;
+ acc.low64 = len * PRIME64_1;
+ acc.high64 = 0;
+ if (len > 32) {
+ if (len > 64) {
+ if (len > 96) {
+ acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+ }
+ acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+ }
+ acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+ }
+ /* the first and the last 16 bytes are mixed for every length in range */
+ acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+ { xxh_u64 const low64 = acc.low64 + acc.high64;
+ xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
+ XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
+ return h128;
+ }
+ }
+}
+
+/* One-shot 128-bit hash of `input[0..len)` with the built-in secret (kSecret)
+ * and seed 0. Dispatches on `len`: <= 16, <= 128, <= XXH3p_MIDSIZE_MAX, and
+ * otherwise the long-input routine. */
+XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* input, size_t len)
+{
+ if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
+ if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+ if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+ return XXH3p_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
+}
+
+/* One-shot 128-bit hash of `input[0..len)` keyed by a caller-supplied `secret`
+ * of `secretSize` bytes, with seed 0. Same length dispatch as XXH3p_128bits().
+ * The minimum secret size is only asserted, not checked at run time. */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3p_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+ XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
+ /* if an action must be taken should `secret` conditions not be respected,
+ * it should be done here.
+ * For now, it's a contract pre-condition.
+ * Adding a check and a branch here would cost performance at every hash */
+ if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+ if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+ if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+ return XXH3p_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+/* One-shot 128-bit hash of `input[0..len)` using the built-in secret (kSecret)
+ * and a caller-supplied `seed`. Short and mid-size inputs pass the seed to the
+ * per-length routines; longer inputs go to XXH3p_hashLong_128b_withSeed(). */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3p_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+ if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
+ if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+ if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+ return XXH3p_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+/* Alias for XXH3p_128bits_withSeed(): same arguments, same result. */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+ return XXH3p_128bits_withSeed(input, len, seed);
+}
+
+
+/* === XXH3 128-bit streaming === */
+
+/* all the functions are actually the same as for the 64-bit streaming variant:
+ the reset currently just forwards to the 64-bit reset (same initial acc values),
+ and only the end of the digest function differs */
+
+/* 128-bit state initializer: forwards unchanged to
+ * XXH3p_64bits_reset_internal(), so both variants start from the same state. */
+static void
+XXH3p_128bits_reset_internal(XXH3p_state_t* statePtr,
+ XXH64_hash_t seed,
+ const xxh_u8* secret, size_t secretSize)
+{
+ XXH3p_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+/* Reset `statePtr` for 128-bit streaming with seed 0 and the built-in secret.
+ * Returns XXH_ERROR when `statePtr` is NULL, XXH_OK otherwise. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_128bits_reset(XXH3p_state_t* statePtr)
+{
+ if (statePtr == NULL) return XXH_ERROR;
+ XXH3p_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+ return XXH_OK;
+}
+
+/* Reset `statePtr` for 128-bit streaming keyed by a caller-supplied secret
+ * (seed 0). The secret is referenced, not copied.
+ * Returns XXH_ERROR, leaving the state untouched, when `statePtr` or `secret`
+ * is NULL or when `secretSize` is below XXH3p_SECRET_SIZE_MIN.
+ * The arguments are validated before XXH3p_128bits_reset_internal() runs,
+ * because the reset it forwards to asserts on them and computes
+ * secretSize - STRIPE_LEN, which would wrap around for an undersized secret. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_128bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize)
+{
+ if (statePtr == NULL) return XXH_ERROR;
+ if (secret == NULL) return XXH_ERROR;
+ if (secretSize < XXH3p_SECRET_SIZE_MIN) return XXH_ERROR;
+ XXH3p_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+ return XXH_OK;
+}
+
+/* Reset `statePtr` for 128-bit streaming with a caller-supplied `seed`.
+ * After the common reset, XXH3p_initCustomSecret() fills the state's own
+ * `customSecret` array from the seed, and `secret` is redirected to that
+ * array. Returns XXH_ERROR when `statePtr` is NULL, XXH_OK otherwise. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_128bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed)
+{
+ if (statePtr == NULL) return XXH_ERROR;
+ XXH3p_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
+ XXH3p_initCustomSecret(statePtr->customSecret, seed);
+ statePtr->secret = statePtr->customSecret;
+ return XXH_OK;
+}
+
+/* Public 128-bit streaming update: XXH3p_update() with XXH3p_acc_128bits. */
+XXH_PUBLIC_API XXH_errorcode
+XXH3p_128bits_update(XXH3p_state_t* state, const void* input, size_t len)
+{
+ return XXH3p_update(state, (const xxh_u8*)input, len, XXH3p_acc_128bits);
+}
+
+/* Return the 128-bit hash of everything fed to `state` so far; the state is
+ * not modified. Totals above XXH3p_MIDSIZE_MAX finish the accumulators with
+ * XXH3p_digest_long() and merge them twice (low half from the start of the
+ * secret, high half from the mirrored window at its end). Shorter totals
+ * re-hash state->buffer with the one-shot functions, using the seed variant
+ * when a seed was set.
+ * NOTE(review): the short path relies on the whole input still sitting in
+ * state->buffer, i.e. XXH3p_INTERNALBUFFER_SIZE >= XXH3p_MIDSIZE_MAX - confirm. */
+XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits_digest (const XXH3p_state_t* state)
+{
+ if (state->totalLen > XXH3p_MIDSIZE_MAX) {
+ XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+ XXH3p_digest_long(acc, state, XXH3p_acc_128bits);
+ XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+ { xxh_u64 const low64 = XXH3p_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
+ xxh_u64 const high64 = XXH3p_mergeAccs(acc, state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)state->totalLen * PRIME64_2));
+ XXH128_hash_t const h128 = { low64, high64 };
+ return h128;
+ }
+ }
+ /* len <= XXH3p_MIDSIZE_MAX : short code */
+ if (state->seed)
+ return XXH3p_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+ return XXH3p_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h> /* memcmp */
+
+/* Compare two 128-bit hashes byte-wise with memcmp().
+ * return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+ /* note : XXH128_hash_t is compact, it has no padding byte */
+ return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* Three-way comparison of two XXH128_hash_t values passed by address:
+ * high64 is compared first, low64 breaks ties. The result is always -1, 0 or 1.
+ * This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1 > *h128_2
+ * <0 if *h128_1 < *h128_2
+ * =0 if *h128_1 == *h128_2 */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+{
+ XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+ XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+ int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+ /* note : bets that, in most cases, hash values are different */
+ if (hcmp) return hcmp;
+ return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*====== Canonical representation ======*/
+/* Serialize `hash` into `dst`: high64 is written to the first 8 bytes and
+ * low64 to the following 8. On a little-endian CPU both halves are
+ * byte-swapped first, so the stored layout does not depend on host byte order. */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) {
+ hash.high64 = XXH_swap64(hash.high64);
+ hash.low64 = XXH_swap64(hash.low64);
+ }
+ memcpy(dst, &hash.high64, sizeof(hash.high64));
+ memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/* Inverse of XXH128_canonicalFromHash(): reads high64 from the first 8 bytes
+ * of `src` and low64 from bytes 8..15, both through XXH_readBE64(). */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+{
+ XXH128_hash_t h;
+ h.high64 = XXH_readBE64(src);
+ h.low64 = XXH_readBE64(src->digest + 8);
+ return h;
+}
+
+
+
+#endif /* XXH3p_H */
diff --git a/src/rocksdb/util/xxhash.cc b/src/rocksdb/util/xxhash.cc
new file mode 100644
index 000000000..6620ae8b6
--- /dev/null
+++ b/src/rocksdb/util/xxhash.cc
@@ -0,0 +1,1160 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+/*
+* xxHash - Fast Hash algorithm
+* Copyright (C) 2012-2016, Yann Collet
+*
+* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* * Redistributions in binary form must reproduce the above
+* copyright notice, this list of conditions and the following disclaimer
+* in the documentation and/or other materials provided with the
+* distribution.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+* You can contact the author at :
+* - xxHash homepage: http://www.xxhash.com
+* - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+
+/* since xxhash.c can be included (via XXH_INLINE_ALL),
+ * it's good practice to protect it with a guard
+ * in case of multiple inclusions */
+#ifndef XXHASH_C_01393879
+#define XXHASH_C_01393879
+
+/* *************************************
+* Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on the compiler but violates the C standard.
+ * It can generate buggy code on targets which do not support unaligned memory accesses.
+ * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+# define XXH_FORCE_MEMORY_ACCESS 2
+# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+ (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+# define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault.
+ * When this macro is enabled, xxHash actively checks input for null pointer.
+ * If it is, the result for a null input pointer is the same as for a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
+# define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_FORCE_ALIGN_CHECK 0
+# else
+# define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+/*!XXH_REROLL:
+ * Whether to reroll XXH32_finalize, and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang. */
+#ifndef XXH_REROLL
+# if defined(__OPTIMIZE_SIZE__)
+# define XXH_REROLL 1
+# else
+# define XXH_REROLL 0
+# endif
+#endif
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/*! Modify the local functions below should you wish to use some other memory routines
+* for malloc(), free() */
+#include <stdlib.h>
+/* Allocation hook: forwards to malloc() and returns its result unchanged. */
+static void* XXH_malloc(size_t s) { return malloc(s); }
+/* Deallocation hook: forwards to free(). */
+static void XXH_free (void* p) { free(p); }
+/*! and for memcpy() */
+#include <string.h>
+/* Copy hook: forwards to memcpy() and returns its result (`dest`). */
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#include <limits.h> /* ULLONG_MAX */
+
+#ifndef XXH_STATIC_LINKING_ONLY
+#define XXH_STATIC_LINKING_ONLY
+#endif
+
+#include "xxhash.h"
+
+/* BEGIN RocksDB customizations */
+#include "util/util.h" /* for FALLTHROUGH_INTENDED, inserted as appropriate */
+/* END RocksDB customizations */
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+# define XXH_FORCE_INLINE static __forceinline
+# define XXH_NO_INLINE static __declspec(noinline)
+#else
+# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# ifdef __GNUC__
+# define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+# define XXH_NO_INLINE static __attribute__((noinline))
+# else
+# define XXH_FORCE_INLINE static inline
+# define XXH_NO_INLINE static
+# endif
+# else
+# define XXH_FORCE_INLINE static
+# define XXH_NO_INLINE static
+# endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+* Debug
+***************************************/
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+# define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+# include <assert.h> /* note : can still be disabled with NDEBUG */
+# define XXH_ASSERT(c) assert(c)
+#else
+# define XXH_ASSERT(c) ((void)0)
+#endif
+
+/* note : use after variable declarations */
+#define XXH_STATIC_ASSERT(c) { enum { XXH_sa = 1/(int)(!!(c)) }; }
+
+
+/* *************************************
+* Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t xxh_u8;
+#else
+ typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* === Memory access === */
+
+/* XXH_read32(): load a 32-bit value in host byte order from a possibly
+ * unaligned address. Exactly one of the three variants below is compiled,
+ * selected by XXH_FORCE_MEMORY_ACCESS (2, 1, or anything else / undefined). */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+ xxh_u32 val;
+ memcpy(&val, memPtr, sizeof(val));
+ return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* === Endianness === */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+# if defined(_WIN32) /* Windows is always little endian */ \
+ || defined(__LITTLE_ENDIAN__) \
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+# define XXH_CPU_LITTLE_ENDIAN 1
+# elif defined(__BIG_ENDIAN__) \
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+# define XXH_CPU_LITTLE_ENDIAN 0
+# else /* byte order unknown at compile time : detect it at run time */
+static int XXH_isLittleEndian(void)
+{
+ const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; /* don't use static : performance detrimental */
+ return one.c[0]; /* 1 when the least significant byte is stored first */
+}
+# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
+# endif
+#endif
+
+
+
+
+/* ****************************************
+* Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) /* e.g. 403 for GCC 4.3 */
+/* Fallback so that __has_builtin() can be tested in #if on compilers that do not provide it */
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+/* XXH_rotl32 / XXH_rotl64 : rotate x left by r bits */
+#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64)
+# define XXH_rotl32 __builtin_rotateleft32
+# define XXH_rotl64 __builtin_rotateleft64
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+# define XXH_rotl32(x,r) _rotl(x,r)
+# define XXH_rotl64(x,r) _rotl64(x,r)
+#else /* portable fallback : x and r are evaluated twice, and r must stay strictly between 0 and the type width */
+# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+/* XXH_swap32 : reverses the byte order of a 32-bit value */
+#if defined(_MSC_VER) /* Visual Studio */
+# define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+# define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+ return ((x << 24) & 0xff000000 ) |
+ ((x << 8) & 0x00ff0000 ) |
+ ((x >> 8) & 0x0000ff00 ) |
+ ((x >> 24) & 0x000000ff );
+}
+#endif
+
+
+/* ***************************
+* Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+/* XXH_readLE32() : reads 32 bits stored in little-endian order; byte-swaps on big-endian hosts */
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+/* XXH_readBE32() : reads 32 bits stored in big-endian order; byte-swaps on little-endian hosts */
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+/* XXH_readLE32_align() : like XXH_readLE32(), but XXH_aligned uses a direct pointer load (ptr is assumed 4-byte aligned then) */
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+ if (align==XXH_unaligned) {
+ return XXH_readLE32(ptr);
+ } else {
+ return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+ }
+}
+
+
+/* *************************************
+* Misc : XXH_versionNumber() returns the XXH_VERSION_NUMBER this file was compiled with
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+* 32-bit hash functions : PRIME32_1..5 are the constants used by the XXH32 rounds, tail processing and avalanche
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+ acc += input * PRIME32_2; /* fold the 32-bit input word into the accumulator */
+ acc = XXH_rotl32(acc, 13);
+ acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+ /* UGLY HACK:
+ * This inline assembly hack forces acc into a normal register. This is the
+ * only thing that prevents GCC and Clang from autovectorizing the XXH32 loop
+ * (pragmas and attributes don't work for some reason) without globally
+ * disabling SSE4.1.
+ *
+ * The reason we want to avoid vectorization is because despite working on
+ * 4 integers at a time, there are multiple factors slowing XXH32 down on
+ * SSE4:
+ * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on newer chips!)
+ * making it slightly slower to multiply four integers at once compared to four
+ * integers independently. Even when pmulld was fastest, Sandy/Ivy Bridge, it is
+ * still not worth it to go into SSE just to multiply unless doing a long operation.
+ *
+ * - Four instructions are required to rotate,
+ * movdqa tmp, v // not required with VEX encoding
+ * pslld tmp, 13 // tmp <<= 13
+ * psrld v, 19 // v >>= 19
+ * por v, tmp // v |= tmp
+ * compared to one for scalar:
+ * roll v, 13 // reliably fast across the board
+ * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
+ *
+ * - Instruction level parallelism is actually more beneficial here because the
+ * SIMD actually serializes this operation: While v1 is rotating, v2 can load data,
+ * while v3 can multiply. SSE forces them to operate together.
+ *
+ * How this hack works:
+ * __asm__("" // Declare an assembly block but don't declare any instructions
+ * : // However, as an Input/Output Operand,
+ * "+r" // constrain a read/write operand (+) as a general purpose register (r).
+ * (acc) // and set acc as the operand
+ * );
+ *
+ * Because of the 'r', the compiler has promised that acc will be in a
+ * general purpose register and the '+' says that it will be 'read/write',
+ * so it has to assume it has changed. It is like volatile without all the
+ * loads and stores.
+ *
+ * Since the argument has to be in a normal register (not an SSE register),
+ * each time XXH32_round is called, it is impossible to vectorize. */
+ __asm__("" : "+r" (acc));
+#endif
+ return acc;
+}
+
+/* XXH32_avalanche() : final mixing of all bits of the accumulated hash.
+ * Xor-shifts by 15, 13 and 16 are interleaved with multiplications by PRIME32_2 and PRIME32_3. */
+static xxh_u32 XXH32_avalanche(xxh_u32 h32)
+{
+ xxh_u32 const s1 = (h32 ^ (h32 >> 15)) * PRIME32_2;
+ xxh_u32 const s2 = (s1 ^ (s1 >> 13)) * PRIME32_3;
+ xxh_u32 const s3 = s2 ^ (s2 >> 16);
+
+ return s3;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align) /* relies on a variable named 'align' being in scope */
+/* XXH32_finalize() : mixes the last (len & 15) bytes at ptr into h32, then applies XXH32_avalanche() */
+static xxh_u32
+XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define PROCESS1 \
+ h32 += (*ptr++) * PRIME32_5; \
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+/* PROCESS1 (above) consumes one byte, PROCESS4 (below) one 32-bit little-endian word; both advance ptr */
+#define PROCESS4 \
+ h32 += XXH_get32bits(ptr) * PRIME32_3; \
+ ptr+=4; \
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+
+ /* Compact rerolled version : same operations as the switch below, expressed as loops */
+ if (XXH_REROLL) {
+ len &= 15;
+ while (len >= 4) {
+ PROCESS4;
+ len -= 4;
+ }
+ while (len > 0) {
+ PROCESS1;
+ --len;
+ }
+ return XXH32_avalanche(h32);
+ } else {
+ switch(len&15) /* or switch(bEnd - p) */ {
+ case 12: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 8: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 4: PROCESS4;
+ return XXH32_avalanche(h32);
+
+ case 13: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 9: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 5: PROCESS4;
+ PROCESS1;
+ return XXH32_avalanche(h32);
+
+ case 14: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 10: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 6: PROCESS4;
+ PROCESS1;
+ PROCESS1;
+ return XXH32_avalanche(h32);
+
+ case 15: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 11: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 7: PROCESS4;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 3: PROCESS1;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 2: PROCESS1;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 1: PROCESS1;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 0: return XXH32_avalanche(h32);
+ }
+ XXH_ASSERT(0);
+ return h32; /* reaching this point is deemed impossible */
+ }
+}
+
+XXH_FORCE_INLINE xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+ const xxh_u8* bEnd = input ? input + len : NULL; /* never do arithmetic on a NULL input : undefined behavior, even with len==0 */
+ xxh_u32 h32;
+ /* One-shot XXH32 core : consumes full 16-byte stripes here, then XXH32_finalize() handles the last len&15 bytes */
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+ if (input==NULL) {
+ len=0;
+ bEnd=input=(const xxh_u8*)(size_t)16;
+ }
+#endif
+
+ if (len>=16) {
+ const xxh_u8* const limit = bEnd - 15;
+ xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2;
+ xxh_u32 v2 = seed + PRIME32_2;
+ xxh_u32 v3 = seed + 0;
+ xxh_u32 v4 = seed - PRIME32_1;
+ /* main loop : 16 bytes per iteration, one 32-bit word for each of the four accumulators */
+ do {
+ v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+ v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+ v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+ v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+ } while (input < limit);
+
+ h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)
+ + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+ } else {
+ h32 = seed + PRIME32_5; /* short input : no accumulators, start directly from the seed */
+ }
+
+ h32 += (xxh_u32)len;
+
+ return XXH32_finalize(h32, input, len&15, align);
+}
+
+/* XXH32() : one-shot 32-bit hash of the len bytes at input, computed with the given seed */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if 0
+ /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+ XXH32_state_t state;
+ XXH32_reset(&state, seed);
+ XXH32_update(&state, (const xxh_u8*)input, len);
+ return XXH32_digest(&state);
+
+#else
+ /* When XXH_FORCE_ALIGN_CHECK is enabled, 4-byte aligned input takes the direct-load (XXH_aligned) path */
+ if (XXH_FORCE_ALIGN_CHECK) {
+ if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */
+ return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+ } }
+
+ return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+#endif
+}
+
+
+
+/*====== Hash streaming ======*/
+/* Allocates a state through XXH_malloc(); this function does not initialize it, so call XXH32_reset() before use */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+ return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{ /* Hands statePtr to XXH_free(); always returns XXH_OK */
+ XXH_free(statePtr);
+ return XXH_OK;
+}
+/* Byte-copies the complete XXH32_state_t from srcState into dstState */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+ memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{ /* Starts a new streaming hash : zeroed counters and buffer, four accumulators derived from seed */
+ XXH32_state_t fresh; /* built locally, then memcpy()'d, in order to avoid strict-aliasing warnings */
+ size_t const copied = sizeof(fresh) - sizeof(fresh.reserved); /* do not write into reserved, planned to be removed in a future version */
+ memset(&fresh, 0, sizeof(fresh));
+ fresh.v4 = seed - PRIME32_1;
+ fresh.v3 = seed;
+ fresh.v2 = seed + PRIME32_2;
+ fresh.v1 = seed + PRIME32_1 + PRIME32_2;
+ memcpy(statePtr, &fresh, copied);
+ return XXH_OK;
+}
+
+/* XXH32_update() : feeds len more bytes into a streaming state; returns XXH_ERROR only for a NULL input without XXH_ACCEPT_NULL_INPUT_POINTER */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+ if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+ return XXH_OK;
+#else
+ return XXH_ERROR;
+#endif
+
+ { const xxh_u8* p = (const xxh_u8*)input;
+ const xxh_u8* const bEnd = p + len;
+
+ state->total_len_32 += (XXH32_hash_t)len;
+ state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); /* sticky flag read by XXH32_digest() */
+
+ if (state->memsize + len < 16) { /* fill in tmp buffer */
+ XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+ state->memsize += (XXH32_hash_t)len;
+ return XXH_OK;
+ }
+
+ if (state->memsize) { /* some data left from previous update : complete the 16-byte buffer and consume it */
+ XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
+ { const xxh_u32* p32 = state->mem32;
+ state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
+ state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
+ state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
+ state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
+ }
+ p += 16-state->memsize;
+ state->memsize = 0;
+ }
+
+ // uintptr_t casts added to avoid array-bounds error on
+ // some inlined calls
+ if ((uintptr_t)p <= (uintptr_t)bEnd - 16) {
+ const uintptr_t limit = (uintptr_t)bEnd - 16;
+ xxh_u32 v1 = state->v1;
+ xxh_u32 v2 = state->v2;
+ xxh_u32 v3 = state->v3;
+ xxh_u32 v4 = state->v4;
+
+ do {
+ v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
+ v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
+ v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
+ v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
+ } while ((uintptr_t)p <= limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < bEnd) { /* fewer than 16 bytes remain : keep them for the next call */
+ XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+ state->memsize = (unsigned)(bEnd-p);
+ }
+ }
+
+ return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state)
+{
+ /* Produces the 32-bit hash of all data fed so far; the state is not modified.
+  * The bytes still buffered in mem32 are mixed in by XXH32_finalize(). */
+ const xxh_u8* const pending = (const xxh_u8*)state->mem32;
+ xxh_u32 acc;
+
+ if (!state->large_len) {
+ acc = state->v3 /* == seed */ + PRIME32_5;
+ } else {
+ acc = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7)
+ + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+ }
+ acc += state->total_len_32;
+
+ return XXH32_finalize(acc, pending, state->memsize, XXH_aligned);
+}
+
+
+/*====== Canonical representation ======*/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+* The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+* These functions allow transformation of hash result into and from its canonical format.
+* This way, hash values can be written into a file or buffer, remaining comparable across different systems.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); /* canonical form is big-endian : swap on little-endian hosts */
+ memcpy(dst, &hash, sizeof(*dst)); /* store the bytes, most significant first */
+}
+/* Inverse of XXH32_canonicalFromHash() : decodes the big-endian bytes at src into a native hash value */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+ return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+* 64-bit hash functions
+*********************************************************************/
+
+/*====== Memory access ======*/
+
+typedef XXH64_hash_t xxh_u64; /* same type as the public 64-bit hash result */
+
+
+/*! XXH_REROLL_XXH64:
+ * Whether to reroll the XXH64_finalize() loop.
+ *
+ * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a performance gain
+ * on 64-bit hosts, as only one jump is required.
+ *
+ * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit registers,
+ * and 64-bit arithmetic needs to be simulated, it isn't beneficial to unroll. The code becomes
+ * ridiculously large (the largest function in the binary on i386!), and rerolling it saves
+ * anywhere from 3kB to 20kB. It is also slightly faster because it fits into cache better
+ * and is more likely to be inlined by the compiler.
+ *
+ * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. */
+#ifndef XXH_REROLL_XXH64
+# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
+ || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
+ || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
+ || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
+ || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
+ || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
+# define XXH_REROLL_XXH64 1 /* ILP32, not a listed 64-bit architecture, or size_t narrower than unsigned long long */
+# else
+# define XXH_REROLL_XXH64 0 /* recognized 64-bit target : keep the unrolled switch */
+# endif
+#endif /* !defined(XXH_REROLL_XXH64) */
+/* XXH_read64() : loads 8 bytes from the given address in native byte order; the access method depends on XXH_FORCE_MEMORY_ACCESS */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+ xxh_u64 val;
+ memcpy(&val, memPtr, sizeof(val)); /* byte-wise copy : memPtr does not need to be aligned */
+ return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+/* XXH_swap64 : reverses the byte order of a 64-bit value */
+#if defined(_MSC_VER) /* Visual Studio */
+# define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+# define XXH_swap64 __builtin_bswap64
+#else /* portable fallback : each of the 8 bytes is moved to its mirrored position */
+static xxh_u64 XXH_swap64 (xxh_u64 x)
+{
+ return ((x << 56) & 0xff00000000000000ULL) |
+ ((x << 40) & 0x00ff000000000000ULL) |
+ ((x << 24) & 0x0000ff0000000000ULL) |
+ ((x << 8) & 0x000000ff00000000ULL) |
+ ((x >> 8) & 0x00000000ff000000ULL) |
+ ((x >> 24) & 0x0000000000ff0000ULL) |
+ ((x >> 40) & 0x000000000000ff00ULL) |
+ ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+/* XXH_readLE64() : reads 64 bits stored in little-endian order; byte-swaps on big-endian hosts */
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+/* XXH_readBE64() : reads 64 bits stored in big-endian order; byte-swaps on little-endian hosts */
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+/* XXH_readLE64_align() : like XXH_readLE64(), but XXH_aligned uses a direct pointer load (ptr is assumed 8-byte aligned then) */
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+ if (align==XXH_unaligned)
+ return XXH_readLE64(ptr);
+ else
+ return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*====== xxh64 ======*/
+/* PRIME64_1..5 : constants used by the XXH64 rounds, merge step, tail processing and avalanche */
+static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+ /* one accumulation step : rotl64(acc + input * PRIME64_2, 31) * PRIME64_1 */
+ xxh_u64 const mixed = acc + input * PRIME64_2;
+ xxh_u64 const rotated = XXH_rotl64(mixed, 31);
+ return rotated * PRIME64_1;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+ /* folds one accumulator (val) into the combined hash (acc) :
+  * val first goes through a round started from 0, then is xor'ed in */
+ xxh_u64 const scrambled = XXH64_round(0, val);
+ return (acc ^ scrambled) * PRIME64_1 + PRIME64_4;
+}
+
+static xxh_u64 XXH64_avalanche(xxh_u64 h64)
+{
+ /* final mixing of all bits : xor-shifts by 33, 29 and 32 interleaved
+  * with multiplications by PRIME64_2 and PRIME64_3 */
+ xxh_u64 const s1 = (h64 ^ (h64 >> 33)) * PRIME64_2;
+ xxh_u64 const s2 = (s1 ^ (s1 >> 29)) * PRIME64_3;
+
+ return s2 ^ (s2 >> 32);
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align) /* relies on a variable named 'align' being in scope */
+/* XXH64_finalize() : mixes the last (len & 31) bytes at ptr into h64, then applies XXH64_avalanche() */
+static xxh_u64
+XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define PROCESS1_64 \
+ h64 ^= (*ptr++) * PRIME64_5; \
+ h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+/* PROCESS1_64 / PROCESS4_64 / PROCESS8_64 consume 1, 4 and 8 bytes respectively and advance ptr */
+#define PROCESS4_64 \
+ h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \
+ ptr+=4; \
+ h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+
+#define PROCESS8_64 { \
+ xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \
+ ptr+=8; \
+ h64 ^= k1; \
+ h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \
+}
+
+ /* Rerolled version for 32-bit targets is faster and much smaller. */
+ if (XXH_REROLL || XXH_REROLL_XXH64) {
+ len &= 31;
+ while (len >= 8) {
+ PROCESS8_64;
+ len -= 8;
+ }
+ if (len >= 4) {
+ PROCESS4_64;
+ len -= 4;
+ }
+ while (len > 0) {
+ PROCESS1_64;
+ --len;
+ }
+ return XXH64_avalanche(h64);
+ } else {
+ switch(len & 31) {
+ case 24: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 16: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 8: PROCESS8_64;
+ return XXH64_avalanche(h64);
+
+ case 28: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 20: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 12: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 4: PROCESS4_64;
+ return XXH64_avalanche(h64);
+
+ case 25: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 17: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 9: PROCESS8_64;
+ PROCESS1_64;
+ return XXH64_avalanche(h64);
+
+ case 29: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 21: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 13: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 5: PROCESS4_64;
+ PROCESS1_64;
+ return XXH64_avalanche(h64);
+
+ case 26: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 18: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 10: PROCESS8_64;
+ PROCESS1_64;
+ PROCESS1_64;
+ return XXH64_avalanche(h64);
+
+ case 30: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 22: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 14: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 6: PROCESS4_64;
+ PROCESS1_64;
+ PROCESS1_64;
+ return XXH64_avalanche(h64);
+
+ case 27: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 19: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 11: PROCESS8_64;
+ PROCESS1_64;
+ PROCESS1_64;
+ PROCESS1_64;
+ return XXH64_avalanche(h64);
+
+ case 31: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 23: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 15: PROCESS8_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 7: PROCESS4_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 3: PROCESS1_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 2: PROCESS1_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 1: PROCESS1_64;
+ FALLTHROUGH_INTENDED;
+ /* fallthrough */
+ case 0: return XXH64_avalanche(h64);
+ }
+ }
+ /* impossible to reach */
+ XXH_ASSERT(0);
+ return 0; /* unreachable, but some compilers complain without it */
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+ const xxh_u8* bEnd = input ? input + len : NULL; /* never do arithmetic on a NULL input : undefined behavior, even with len==0 */
+ xxh_u64 h64;
+ /* One-shot XXH64 core : consumes full 32-byte stripes here, then XXH64_finalize() handles the last len&31 bytes */
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+ if (input==NULL) {
+ len=0;
+ bEnd=input=(const xxh_u8*)(size_t)32;
+ }
+#endif
+
+ if (len>=32) {
+ const xxh_u8* const limit = bEnd - 32;
+ xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2;
+ xxh_u64 v2 = seed + PRIME64_2;
+ xxh_u64 v3 = seed + 0;
+ xxh_u64 v4 = seed - PRIME64_1;
+ /* main loop : 32 bytes per iteration, one 64-bit word for each of the four accumulators */
+ do {
+ v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+ v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+ v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+ v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+ } while (input<=limit);
+
+ h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+ h64 = XXH64_mergeRound(h64, v1);
+ h64 = XXH64_mergeRound(h64, v2);
+ h64 = XXH64_mergeRound(h64, v3);
+ h64 = XXH64_mergeRound(h64, v4);
+
+ } else {
+ h64 = seed + PRIME64_5; /* short input : no accumulators, start directly from the seed */
+ }
+
+ h64 += (xxh_u64) len;
+
+ return XXH64_finalize(h64, input, len, align); /* XXH64_finalize() masks len with 31 itself */
+}
+
+/* XXH64() : one-shot 64-bit hash of the len bytes at input, computed with the given seed */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+{
+#if 0
+ /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+ XXH64_state_t state;
+ XXH64_reset(&state, seed);
+ XXH64_update(&state, (const xxh_u8*)input, len);
+ return XXH64_digest(&state);
+
+#else
+ /* When XXH_FORCE_ALIGN_CHECK is enabled, 8-byte aligned input takes the direct-load (XXH_aligned) path */
+ if (XXH_FORCE_ALIGN_CHECK) {
+ if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */
+ return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+ } }
+
+ return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+
+#endif
+}
+
+/*====== Hash Streaming ======*/
+/* Allocates a state through XXH_malloc(); this function does not initialize it, so call XXH64_reset() before use */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+ return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{ /* Hands statePtr to XXH_free(); always returns XXH_OK */
+ XXH_free(statePtr);
+ return XXH_OK;
+}
+/* Byte-copies the complete XXH64_state_t from srcState into dstState */
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{
+ memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+{ /* Starts a new streaming hash : zeroed counters and buffer, four accumulators derived from seed */
+ XXH64_state_t fresh; /* built locally, then memcpy()'d, in order to avoid strict-aliasing warnings */
+ size_t const copied = sizeof(fresh) - sizeof(fresh.reserved64); /* do not write into reserved64, might be removed in a future version */
+ memset(&fresh, 0, sizeof(fresh));
+ fresh.v4 = seed - PRIME64_1;
+ fresh.v3 = seed;
+ fresh.v2 = seed + PRIME64_2;
+ fresh.v1 = seed + PRIME64_1 + PRIME64_2;
+ memcpy(statePtr, &fresh, copied);
+ return XXH_OK;
+}
+/* XXH64_update() : feeds len more bytes into a streaming state; returns XXH_ERROR only for a NULL input without XXH_ACCEPT_NULL_INPUT_POINTER */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+{
+ if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+ return XXH_OK;
+#else
+ return XXH_ERROR;
+#endif
+
+ { const xxh_u8* p = (const xxh_u8*)input;
+ const xxh_u8* const bEnd = p + len;
+
+ state->total_len += len;
+
+ if (state->memsize + len < 32) { /* fill in tmp buffer */
+ XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+ state->memsize += (xxh_u32)len;
+ return XXH_OK;
+ }
+
+ if (state->memsize) { /* some data left from previous update : complete the 32-byte buffer and consume it */
+ XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+ state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
+ state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
+ state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
+ state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
+ p += 32-state->memsize;
+ state->memsize = 0;
+ }
+
+ // uintptr_t casts added to avoid array-bounds error on
+ // some inlined calls
+ if ((uintptr_t)p + 32 <= (uintptr_t)bEnd) {
+ const uintptr_t limit = (uintptr_t)bEnd - 32;
+ xxh_u64 v1 = state->v1;
+ xxh_u64 v2 = state->v2;
+ xxh_u64 v3 = state->v3;
+ xxh_u64 v4 = state->v4;
+
+ do {
+ v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
+ v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
+ v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
+ v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
+ } while ((uintptr_t)p <= limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < bEnd) { /* fewer than 32 bytes remain : keep them for the next call */
+ XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+ state->memsize = (unsigned)(bEnd-p);
+ }
+ }
+
+ return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state)
+{
+ /* Produces the 64-bit hash of all data fed so far; the state is not modified.
+  * XXH64_finalize() only uses the low 5 bits of its length, so total_len is passed as is. */
+ const xxh_u8* const pending = (const xxh_u8*)state->mem64;
+ xxh_u64 acc;
+ if (state->total_len < 32) {
+ acc = state->v3 /*seed*/ + PRIME64_5;
+ } else {
+ xxh_u64 const lane1 = state->v1;
+ xxh_u64 const lane2 = state->v2;
+ xxh_u64 const lane3 = state->v3;
+ xxh_u64 const lane4 = state->v4;
+ acc = XXH_rotl64(lane1, 1) + XXH_rotl64(lane2, 7) + XXH_rotl64(lane3, 12) + XXH_rotl64(lane4, 18);
+ acc = XXH64_mergeRound(acc, lane1);
+ acc = XXH64_mergeRound(acc, lane2);
+ acc = XXH64_mergeRound(acc, lane3);
+ acc = XXH64_mergeRound(acc, lane4);
+ }
+ acc += (xxh_u64) state->total_len;
+
+ return XXH64_finalize(acc, pending, (size_t)state->total_len, XXH_aligned);
+}
+
+
+/*====== Canonical representation ======*/
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); /* canonical form is big-endian : swap on little-endian hosts */
+ memcpy(dst, &hash, sizeof(*dst)); /* store the bytes, most significant first */
+}
+/* Inverse of XXH64_canonicalFromHash() : decodes the big-endian bytes at src into a native hash value */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+ return XXH_readBE64(src);
+}
+
+
+
+/* *********************************************************************
+* XXH3
+* New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+
+#include "xxh3p.h" /* XXH3 preview for RocksDB */
+
+
+#endif /* XXH_NO_LONG_LONG */
+
+#endif /* XXHASH_C_01393879 */
diff --git a/src/rocksdb/util/xxhash.h b/src/rocksdb/util/xxhash.h
new file mode 100644
index 000000000..59d9b97d6
--- /dev/null
+++ b/src/rocksdb/util/xxhash.h
@@ -0,0 +1,598 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+/*
+ xxHash - Extremely Fast Hash algorithm
+ Header File
+ Copyright (C) 2012-2016, Yann Collet.
+
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name Speed Q.Score Author
+xxHash 5.4 GB/s 10
+CrapWow 3.2 GB/s 2 Andrew
+MurmurHash 3a 2.7 GB/s 10 Austin Appleby
+SpookyHash 2.0 GB/s 10 Bob Jenkins
+SBox 1.4 GB/s 9 Bret Mulvey
+Lookup3 1.2 GB/s 9 Bob Jenkins
+SuperFastHash 1.2 GB/s 1 Paul Hsieh
+CityHash64 1.05 GB/s 10 Pike & Alakuijala
+FNV 0.55 GB/s 5 Fowler, Noll, Vo
+CRC32 0.43 GB/s # 9
+MD5-32 0.33 GB/s 10 Ronald L. Rivest
+SHA1-32 0.28 GB/s 10
+
+Note #: other CRC32 implementations can be over 40x faster than SMHasher's:
+http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name Speed on 64 bits Speed on 32 bits
+XXH64 13.8 GB/s 1.9 GB/s
+XXH32 6.8 GB/s 6.0 GB/s
+*/
+
+/* Include guard; the matching #endif is at the end of this header. */
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/* BEGIN RocksDB customizations */
+/* Always enable the XXH_STATIC_LINKING_ONLY section further down
+ * (state struct layouts and the XXH3p declarations). */
+#ifndef XXH_STATIC_LINKING_ONLY
+#define XXH_STATIC_LINKING_ONLY 1 /* access experimental APIs like XXH3 */
+#endif
+/* Prefix public symbols with ROCKSDB_ through the XXH_NAMESPACE renaming
+ * macros below. Not guarded by #ifndef: an XXH_NAMESPACE already set by
+ * the includer is redefined here. */
+#define XXH_NAMESPACE ROCKSDB_
+/* END RocksDB customizations */
+
+/* Wrap the declarations in a C linkage block when compiled as C++. */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h> /* size_t */
+/* Status code returned by the freeState/reset/update functions declared in
+ * this header: XXH_OK is 0, XXH_ERROR is the (only) failure value. */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+ * API modifier
+ ******************************/
+/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ * This build macro includes xxhash functions in `static` mode
+ * in order to inline them, and remove their symbol from the public list.
+ * Inlining offers great performance improvement on small keys,
+ * and dramatic ones when length is expressed as a compile-time constant.
+ * See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html .
+ * Methodology :
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * `xxhash.cc` is automatically included.
+ * It's not useful to compile and link it as a separate object.
+ *
+ * Without XXH_INLINE_ALL / XXH_PRIVATE_API, XXH_PUBLIC_API is empty, except
+ * for MSVC on WIN32 where XXH_EXPORT / XXH_IMPORT select
+ * __declspec(dllexport) / __declspec(dllimport). XXH_EXPORT takes precedence
+ * when both are defined.
+ */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+#else
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif defined(XXH_IMPORT)
+       /* Test for definition, not value: an empty or zero XXH_IMPORT
+        * must still select dllimport instead of breaking the build. */
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API /* do nothing */
+#  endif
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/*! XXH_NAMESPACE, aka Namespace Emulation :
+ *
+ * If you want to include _and expose_ xxHash functions from within your own library,
+ * but also want to avoid symbol collisions with other libraries which may also include xxHash,
+ *
+ * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+ * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
+ *
+ * Note that no change is required within the calling program as long as it includes `xxhash.h` :
+ * regular symbol name will be automatically translated by this header.
+ */
+/* Symbol renaming for the XXH32 / XXH64 public API.
+ * XXH_NAME2 goes through XXH_CAT so that XXH_NAMESPACE is macro-expanded
+ * before the ## paste; each #define below maps a public name to
+ * <namespace><name>. */
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+* Version
+***************************************/
+/* Version 0.7.2, packed as MAJOR*10000 + MINOR*100 + RELEASE (i.e. 702). */
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 7
+#define XXH_VERSION_RELEASE 2
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+/* NOTE(review): presumably returns the XXH_VERSION_NUMBER the library was
+ * compiled with; the definition is not part of this header - confirm. */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/*-**********************************************************************
+* 32-bit hash
+************************************************************************/
+/* XXH32_hash_t : 32-bit unsigned type used for XXH32 results and seeds.
+ * C++ and C99+ (except on VMS) take uint32_t from <stdint.h>; otherwise the
+ * type is chosen from <limits.h>: unsigned int if it is 32 bits wide, else
+ * unsigned long, else the build is rejected with #error. */
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint32_t XXH32_hash_t;
+#else
+# include <limits.h>
+# if UINT_MAX == 0xFFFFFFFFUL
+ typedef unsigned int XXH32_hash_t;
+# else
+# if ULONG_MAX == 0xFFFFFFFFUL
+ typedef unsigned long XXH32_hash_t;
+# else
+# error "unsupported platform : need a 32-bit type"
+# endif
+# endif
+#endif
+
+/*! XXH32() :
+ Calculate the 32-bit hash of a sequence of "length" bytes stored at memory address "input".
+ The memory between input & input+length must be valid (allocated and read-accessible).
+ "seed" can be used to alter the result predictably.
+ The hash is returned as an XXH32_hash_t.
+ Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/*====== Streaming ======*/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * XXH state must first be allocated, using XXH*_createState() .
+ *
+ * Start a new hash by initializing state with a seed, using XXH*_reset().
+ *
+ * Then, feed the hash state by calling XXH*_update() as many times as necessary.
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using XXH*_digest().
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a digest,
+ * and generate some new hash values later on, by invoking again XXH*_digest().
+ *
+ * When done, release the state, using XXH*_freeState().
+ */
+
+/* Opaque here; the layout is only given in the XXH_STATIC_LINKING_ONLY
+ * section of this header. */
+typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+/* Allocate a state; release it with XXH32_freeState(). */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+/* NOTE(review): presumably copies *src_state into *dst_state - confirm. */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/* reset: start a new hash with `seed`; update: feed `length` bytes from
+ * `input` (may be called repeatedly); digest: produce the hash for what
+ * has been fed so far, after which feeding input can continue. */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+/*====== Canonical representation ======*/
+
+/* Default return values from XXH functions are basic unsigned 32 and 64 bits.
+ * This is the simplest and fastest format for further post-processing.
+ * However, this leaves open the question of what is the order of bytes,
+ * since little and big endian conventions will write the same number differently.
+ *
+ * The canonical representation settles this issue,
+ * by mandating big-endian convention,
+ * aka, the same convention as human-readable numbers (large digits first).
+ * When writing hash values to storage, sending them over a network, or printing them,
+ * it's highly recommended to use the canonical representation,
+ * to ensure portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values into and from canonical format.
+ */
+
+/* Canonical (big-endian, per the convention described above) form of a
+ * 32-bit hash: exactly 4 bytes. */
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+/* Write `hash` into `dst` in canonical form. */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+/* Read a hash value back from its canonical form in `src`. */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+* 64-bit hash
+************************************************************************/
+/* XXH64_hash_t : 64-bit unsigned type used for XXH64 results and seeds.
+ * C++ and C99+ (except on VMS) take uint64_t from <stdint.h>; otherwise
+ * unsigned long long is used and is assumed to be 64 bits wide (nothing
+ * here checks that width). */
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint64_t XXH64_hash_t;
+#else
+ /* the following type must have a width of 64-bit */
+ typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*! XXH64() :
+ Calculate the 64-bit hash of a sequence of "length" bytes stored at memory address "input".
+ "seed" can be used to alter the result predictably.
+ This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
+*/
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed);
+
+/*====== Streaming ======*/
+/* Same create / reset / update / digest / free sequence as the XXH32
+ * streaming functions, with a 64-bit seed and result. */
+typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
+/* NOTE(review): presumably copies *src_state into *dst_state - confirm. */
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+/* Takes a const state: digest reads the state without modifying it. */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
+
+/*====== Canonical representation ======*/
+/* Canonical form of a 64-bit hash: exactly 8 bytes, big-endian. */
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+/* Write `hash` into `dst`; the value is byte-swapped first on little-endian CPUs. */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+/* Read the hash back from `src` as a big-endian 64-bit value. */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+
+#endif /* XXH_NO_LONG_LONG */
+
+
+
+#ifdef XXH_STATIC_LINKING_ONLY
+
+/* ================================================================================================
+ This section contains declarations which are not guaranteed to remain stable.
+ They may change in future versions, becoming incompatible with a different version of the library.
+ These declarations should only be used with static linking.
+ Never use them in association with dynamic linking !
+=================================================================================================== */
+
+/* These definitions are only present to allow
+ * static allocation of XXH state, on stack or in a struct for example.
+ * Never **ever** use members directly. */
+
+/* Streaming state for XXH32.
+ * NOTE(review): the XXH32 implementation is not visible from this header;
+ * the fields presumably mirror XXH64_state_s (length counters, four
+ * accumulators v1..v4, a carry-over buffer mem32 and its fill level
+ * memsize) - confirm against xxhash.cc before relying on this. */
+struct XXH32_state_s {
+ XXH32_hash_t total_len_32;
+ XXH32_hash_t large_len;
+ XXH32_hash_t v1;
+ XXH32_hash_t v2;
+ XXH32_hash_t v3;
+ XXH32_hash_t v4;
+ XXH32_hash_t mem32[4];
+ XXH32_hash_t memsize;
+ XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */
+}; /* typedef'd to XXH32_state_t */
+
+#ifndef XXH_NO_LONG_LONG /* remove 64-bit support */
+/* Streaming state for XXH64. Field notes describe how XXH64_update() and
+ * XXH64_digest() in xxhash.cc use them. */
+struct XXH64_state_s {
+ XXH64_hash_t total_len; /* digest: selects the >= 32 path and is added to the hash */
+ XXH64_hash_t v1; /* v1..v4: accumulators written back by update, merged by digest */
+ XXH64_hash_t v2;
+ XXH64_hash_t v3; /* also read as the seed by digest when total_len < 32 */
+ XXH64_hash_t v4;
+ XXH64_hash_t mem64[4]; /* update copies the unprocessed tail of the input here */
+ XXH32_hash_t memsize; /* byte count recorded by update for the data held in mem64 */
+ XXH32_hash_t reserved32; /* required for padding anyway */
+ XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */
+}; /* typedef'd to XXH64_state_t */
+#endif /* XXH_NO_LONG_LONG */
+
+
+/*-**********************************************************************
+* XXH3
+* New experimental hash
+************************************************************************/
+#ifndef XXH_NO_LONG_LONG
+
+
+/* ============================================
+ * XXH3 is a new hash algorithm,
+ * featuring improved speed performance for both small and large inputs.
+ * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ * In general, expect XXH3 to run about ~2x faster on large inputs,
+ * and >3x faster on small ones, though exact differences depend on platform.
+ *
+ * The algorithm is portable, will generate the same hash on all platforms.
+ * It benefits greatly from vectorization units, but does not require it.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant :
+ * it reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The XXH3 algorithm is still considered experimental.
+ * Produced results can still change between versions.
+ * Results produced by v0.7.x are not comparable with results from v0.7.y .
+ * It's nonetheless possible to use XXH3 for ephemeral data (local sessions),
+ * but avoid storing values in long-term storage for later reads.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ *
+ * There are still a number of open questions that the community can influence during the experimental period.
+ * I'm trying to list a few of them below, though don't consider this list as complete.
+ *
+ * - 128-bits output type : currently defined as a structure of two 64-bits fields.
+ * That's because 128-bit values do not exist in C standard.
+ * Note that it means that, at byte level, the result differs depending on endianness.
+ * However, at field level, they are identical on all platforms.
+ * The canonical representation solves the issue of identical byte-level representation across platforms,
+ * which is necessary for serialization.
+ * Q1 : Would there be a better representation for a 128-bit hash result ?
+ * Q2 : Are the names of the inner 64-bit fields important ? Should they be changed ?
+ *
+ * - Prototype XXH128() : XXH128() uses the same arguments as XXH64(), for consistency.
+ * It means it maps to XXH3p_128bits_withSeed().
+ * This variant is slightly slower than XXH3p_128bits(),
+ * because the seed is now part of the algorithm, and can't be simplified.
+ * Is that a good idea ?
+ *
+ * - Seed type for XXH128() : currently, it's a single 64-bit value, like the 64-bit variant.
+ * It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
+ * But 128-bit seed is more difficult to use, since it requires to pass a structure instead of a scalar value.
+ * Such a variant could either replace current one, or become an additional one.
+ * Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
+ * Follow up question : if both 64-bit and 128-bit seeds are allowed, which variant should be called XXH128 ?
+ *
+ * - Result for len==0 : Currently, the result of hashing a zero-length input is always `0`.
+ * It seems okay as a return value when using "default" secret and seed.
+ * But is it still fine to return `0` when secret or seed are non-default ?
+ * Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ?
+ *
+ * - Consistency (1) : Streaming XXH128 uses an XXH3 state, which is the same state as XXH3p_64bits().
+ * It means a 128bit streaming loop must invoke the following symbols :
+ * XXH3p_createState(), XXH3p_128bits_reset(), XXH3p_128bits_update() (loop), XXH3p_128bits_digest(), XXH3p_freeState().
+ * Is that consistent enough ?
+ *
+ * - Consistency (2) : The canonical representation of `XXH3p_64bits` is provided by existing functions
+ * XXH64_canonicalFromHash(), and reverse operation XXH64_hashFromCanonical().
+ * As a mirror, canonical functions for XXH128_hash_t results generated by `XXH3p_128bits`
+ * are XXH128_canonicalFromHash() and XXH128_hashFromCanonical().
+ * Which means, `XXH3` doesn't appear in the names, because canonical functions operate on a type,
+ * independently of which algorithm was used to generate that type.
+ * Is that consistent enough ?
+ */
+
+/* Symbol renaming for the XXH3p 64-bit API and the shared state helpers;
+ * relies on XXH_NAME2 defined with the XXH32/XXH64 renaming macros. */
+#ifdef XXH_NAMESPACE
+# define XXH3p_64bits XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits)
+# define XXH3p_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits_withSecret)
+# define XXH3p_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits_withSeed)
+
+# define XXH3p_createState XXH_NAME2(XXH_NAMESPACE, XXH3p_createState)
+# define XXH3p_freeState XXH_NAME2(XXH_NAMESPACE, XXH3p_freeState)
+# define XXH3p_copyState XXH_NAME2(XXH_NAMESPACE, XXH3p_copyState)
+
+# define XXH3p_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits_reset)
+# define XXH3p_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits_reset_withSeed)
+# define XXH3p_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits_reset_withSecret)
+# define XXH3p_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits_update)
+# define XXH3p_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3p_64bits_digest)
+#endif
+
+/* XXH3p_64bits() :
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant.
+ * `data` / `len` : input buffer and its length
+ * (NOTE(review): presumably in bytes, as for XXH32() - confirm). */
+XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* data, size_t len);
+
+/* XXH3p_64bits_withSecret() :
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The secret *must* be large enough (>= XXH3p_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid repeating same character, or sequences of bytes,
+ * and especially avoid swathes of \0.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+/* Smallest `secretSize` accepted by the _withSecret functions. */
+#define XXH3p_SECRET_SIZE_MIN 136
+XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/* XXH3p_64bits_withSeed() :
+ * This variant generates on the fly a custom secret,
+ * based on the default secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
+ * note : seed==0 produces same results as XXH3p_64bits() */
+XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+/* XXH_ALIGN(n) : request n-byte alignment for the declaration that follows.
+ * Selection order:
+ *   1. C11 or later    -> alignas() from <stdalign.h>
+ *   2. GCC-compatible  -> __attribute__ ((aligned(n)))
+ *   3. MSVC            -> __declspec(align(n))
+ *   4. other C++11+    -> alignas keyword
+ *   5. anything else   -> expands to nothing (no alignment is requested)
+ * Branch 4 is placed after the compiler-specific branches so that every
+ * build which already matched 2 or 3 keeps exactly the same expansion. */
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n) alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n) __declspec(align(n))
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11+ : alignas is a keyword */
+#  define XXH_ALIGN(n) alignas(n)
+#else
+#  define XXH_ALIGN(n) /* disabled */
+#endif
+
+/* Streaming state used by both the 64-bit and the 128-bit XXH3p streaming
+ * functions (all of them take an XXH3p_state_t*). */
+typedef struct XXH3p_state_s XXH3p_state_t;
+
+#define XXH3p_SECRET_DEFAULT_SIZE 192 /* minimum XXH3p_SECRET_SIZE_MIN */
+#define XXH3p_INTERNALBUFFER_SIZE 256
+/* The first three members request 64-byte alignment through XXH_ALIGN;
+ * where XXH_ALIGN expands to nothing, no alignment is requested.
+ * As with the other state structs, members are not meant to be accessed
+ * directly by callers. */
+struct XXH3p_state_s {
+ XXH_ALIGN(64) XXH64_hash_t acc[8];
+ XXH_ALIGN(64) unsigned char customSecret[XXH3p_SECRET_DEFAULT_SIZE]; /* used to store a custom secret generated from the seed. Makes state larger. Design might change */
+ XXH_ALIGN(64) unsigned char buffer[XXH3p_INTERNALBUFFER_SIZE];
+ XXH32_hash_t bufferedSize;
+ XXH32_hash_t nbStripesPerBlock;
+ XXH32_hash_t nbStripesSoFar;
+ XXH32_hash_t secretLimit;
+ XXH32_hash_t reserved32;
+ XXH32_hash_t reserved32_2;
+ XXH64_hash_t totalLen;
+ XXH64_hash_t seed;
+ XXH64_hash_t reserved64;
+ const unsigned char* secret; /* note : there is some padding after, due to alignment on 64 bytes */
+}; /* typedef'd to XXH3p_state_t */
+
+/* Streaming requires state maintenance.
+ * This operation costs memory and cpu.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer using one-shot functions whenever possible. */
+
+/* State lifetime helpers, shared by the 64-bit and 128-bit variants.
+ * NOTE(review): copyState presumably copies *src_state into *dst_state - confirm. */
+XXH_PUBLIC_API XXH3p_state_t* XXH3p_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3p_freeState(XXH3p_state_t* statePtr);
+XXH_PUBLIC_API void XXH3p_copyState(XXH3p_state_t* dst_state, const XXH3p_state_t* src_state);
+
+
+/* XXH3p_64bits_reset() :
+ * initialize with default parameters.
+ * result will be equivalent to `XXH3p_64bits()`. */
+XXH_PUBLIC_API XXH_errorcode XXH3p_64bits_reset(XXH3p_state_t* statePtr);
+/* XXH3p_64bits_reset_withSeed() :
+ * generate a custom secret from `seed`, and store it into state.
+ * digest will be equivalent to `XXH3p_64bits_withSeed()`. */
+XXH_PUBLIC_API XXH_errorcode XXH3p_64bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed);
+/* XXH3p_64bits_reset_withSecret() :
+ * `secret` is referenced, and must outlive the hash streaming session.
+ * secretSize must be >= XXH3p_SECRET_SIZE_MIN.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3p_64bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize);
+
+/* update : feed `length` bytes from `input` into the state.
+ * digest : return the 64-bit hash; takes a const state, so the state is
+ *          not modified by the call. */
+XXH_PUBLIC_API XXH_errorcode XXH3p_64bits_update (XXH3p_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits_digest (const XXH3p_state_t* statePtr);
+
+
+/* 128-bit */
+
+/* Symbol renaming for the 128-bit API (one-shot, streaming, comparison
+ * and canonical helpers); relies on XXH_NAME2 defined earlier. */
+#ifdef XXH_NAMESPACE
+# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+# define XXH3p_128bits XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits)
+# define XXH3p_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits_withSeed)
+# define XXH3p_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits_withSecret)
+
+# define XXH3p_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits_reset)
+# define XXH3p_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits_reset_withSeed)
+# define XXH3p_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits_reset_withSecret)
+# define XXH3p_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits_update)
+# define XXH3p_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3p_128bits_digest)
+
+# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+/* 128-bit hash value, represented as two 64-bit fields. */
+typedef struct {
+ XXH64_hash_t low64;
+ XXH64_hash_t high64;
+} XXH128_hash_t;
+
+/* One-shot 128-bit hashing. The parameters parallel the XXH3p_64bits*
+ * functions; XXH128() takes the same arguments as XXH64(). */
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */
+XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/* Streaming 128-bit hashing; operates on the same XXH3p_state_t as the
+ * 64-bit streaming functions. */
+XXH_PUBLIC_API XXH_errorcode XXH3p_128bits_reset(XXH3p_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3p_128bits_reset_withSeed(XXH3p_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3p_128bits_reset_withSecret(XXH3p_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3p_128bits_update (XXH3p_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits_digest (const XXH3p_state_t* statePtr);
+
+
+/* Note : for better performance, following functions can be inlined,
+ * using XXH_INLINE_ALL */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/* This comparator is compatible with stdlib's qsort().
+ * NOTE(review): both arguments presumably point to XXH128_hash_t values - confirm.
+ * return : >0 if *h128_1 > *h128_2
+ * <0 if *h128_1 < *h128_2
+ * =0 if *h128_1 == *h128_2 */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*====== Canonical representation ======*/
+/* Canonical form of a 128-bit hash: exactly 16 bytes. The conversion
+ * functions operate on XXH128_hash_t values. */
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif /* XXH_NO_LONG_LONG */
+
+
+/*-**********************************************************************
+* XXH_INLINE_ALL
+************************************************************************/
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+# include "xxhash.cc" /* include xxhash function bodies as `static`, for inlining */
+#endif
+
+
+
+#endif /* XXH_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* XXHASH_H_5627135585666179 */