author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-27 18:24:20 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-27 18:24:20 +0000
commit     483eb2f56657e8e7f419ab1a4fab8dce9ade8609
tree       e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/rocksdb/util
parent     Initial commit.
Adding upstream version 14.2.21.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/util')
121 files changed, 25997 insertions, 0 deletions
diff --git a/src/rocksdb/util/aligned_buffer.h b/src/rocksdb/util/aligned_buffer.h new file mode 100644 index 00000000..2201b487 --- /dev/null +++ b/src/rocksdb/util/aligned_buffer.h @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <algorithm> +#include "port/port.h" + +namespace rocksdb { + +inline size_t TruncateToPageBoundary(size_t page_size, size_t s) { + s -= (s & (page_size - 1)); + assert((s % page_size) == 0); + return s; +} + +inline size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; +} + +inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; } + +// This class is to manage an aligned user +// allocated buffer for direct I/O purposes +// though can be used for any purpose. +class AlignedBuffer { + size_t alignment_; + std::unique_ptr<char[]> buf_; + size_t capacity_; + size_t cursize_; + char* bufstart_; + +public: + AlignedBuffer() + : alignment_(), + capacity_(0), + cursize_(0), + bufstart_(nullptr) { + } + + AlignedBuffer(AlignedBuffer&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + AlignedBuffer& operator=(AlignedBuffer&& o) ROCKSDB_NOEXCEPT { + alignment_ = std::move(o.alignment_); + buf_ = std::move(o.buf_); + capacity_ = std::move(o.capacity_); + cursize_ = std::move(o.cursize_); + bufstart_ = std::move(o.bufstart_); + return *this; + } + + AlignedBuffer(const AlignedBuffer&) = delete; + + AlignedBuffer& operator=(const AlignedBuffer&) = delete; + + static bool isAligned(const void* ptr, size_t alignment) { + return reinterpret_cast<uintptr_t>(ptr) % alignment == 0; + } + + static bool isAligned(size_t n, size_t alignment) { + return n % alignment == 0; + } + + size_t Alignment() const { + return alignment_; + } + + size_t Capacity() const { + return capacity_; + } + + size_t CurrentSize() const { + return cursize_; + } + + const char* BufferStart() const { + return bufstart_; + } + + char* BufferStart() { return bufstart_; } + + void Clear() { + cursize_ = 0; + } + + void Alignment(size_t alignment) { + assert(alignment > 0); + assert((alignment & (alignment - 1)) == 0); + alignment_ = alignment; + } + + // Allocates a new buffer and sets bufstart_ to the aligned first byte. + // requested_capacity: requested new buffer capacity. This capacity will be + // rounded up based on alignment. + // copy_data: Copy data from old buffer to new buffer. + // copy_offset: Copy data from this offset in old buffer. + // copy_len: Number of bytes to copy. + void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false, + uint64_t copy_offset = 0, size_t copy_len = 0) { + assert(alignment_ > 0); + assert((alignment_ & (alignment_ - 1)) == 0); + + copy_len = copy_len > 0 ? copy_len : cursize_; + if (copy_data && requested_capacity < copy_len) { + // If we are downsizing to a capacity that is smaller than the current + // data in the buffer. Ignore the request. 
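+ // (At this point copy_len holds cursize_ whenever the caller passed 0,
+ // so this guard also rejects any resize that would drop buffered data.)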
+ return; + } + + size_t new_capacity = Roundup(requested_capacity, alignment_); + char* new_buf = new char[new_capacity + alignment_]; + char* new_bufstart = reinterpret_cast<char*>( + (reinterpret_cast<uintptr_t>(new_buf) + (alignment_ - 1)) & + ~static_cast<uintptr_t>(alignment_ - 1)); + + if (copy_data) { + assert(bufstart_ + copy_offset + copy_len <= bufstart_ + cursize_); + memcpy(new_bufstart, bufstart_ + copy_offset, copy_len); + cursize_ = copy_len; + } else { + cursize_ = 0; + } + + bufstart_ = new_bufstart; + capacity_ = new_capacity; + buf_.reset(new_buf); + } + // Used for write + // Returns the number of bytes appended + size_t Append(const char* src, size_t append_size) { + size_t buffer_remaining = capacity_ - cursize_; + size_t to_copy = std::min(append_size, buffer_remaining); + + if (to_copy > 0) { + memcpy(bufstart_ + cursize_, src, to_copy); + cursize_ += to_copy; + } + return to_copy; + } + + size_t Read(char* dest, size_t offset, size_t read_size) const { + assert(offset < cursize_); + + size_t to_read = 0; + if(offset < cursize_) { + to_read = std::min(cursize_ - offset, read_size); + } + if (to_read > 0) { + memcpy(dest, bufstart_ + offset, to_read); + } + return to_read; + } + + /// Pad to alignment + void PadToAlignmentWith(int padding) { + size_t total_size = Roundup(cursize_, alignment_); + size_t pad_size = total_size - cursize_; + + if (pad_size > 0) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + } + + void PadWith(size_t pad_size, int padding) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + + // After a partial flush move the tail to the beginning of the buffer + void RefitTail(size_t tail_offset, size_t tail_size) { + if (tail_size > 0) { + memmove(bufstart_, bufstart_ + tail_offset, tail_size); + } + cursize_ = tail_size; + } + + // Returns place to start writing + char* Destination() { + return bufstart_ + cursize_; + } + + void Size(size_t cursize) { + cursize_ = cursize; + } +}; +} diff --git a/src/rocksdb/util/allocator.h b/src/rocksdb/util/allocator.h new file mode 100644 index 00000000..505d6ba2 --- /dev/null +++ b/src/rocksdb/util/allocator.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Abstract interface for allocating memory in blocks. This memory is freed +// when the allocator object is destroyed. See the Arena class for more info. 
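AlignedBuffer::AllocateNewBuffer above produces an aligned region by over-allocating one extra alignment unit and masking the start pointer up to the next boundary, and the Allocator interface declared just below abstracts block allocation. As a hedged, standalone sketch (hypothetical names, not part of this diff), here is a minimal bump allocator combining both ideas with the same over-allocate-and-mask technique:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical bump allocator; hands out raw bytes from a fixed backing
// buffer, with aligned requests served via pointer masking.
class BumpAllocator {
 public:
  BumpAllocator(char* buf, size_t len) : cur_(buf), end_(buf + len) {}

  char* Allocate(size_t bytes) {
    if (static_cast<size_t>(end_ - cur_) < bytes) return nullptr;
    char* r = cur_;
    cur_ += bytes;
    return r;
  }

  char* AllocateAligned(size_t bytes, size_t alignment) {
    assert((alignment & (alignment - 1)) == 0);  // power of two required
    uintptr_t p = reinterpret_cast<uintptr_t>(cur_);
    uintptr_t aligned =
        (p + alignment - 1) & ~static_cast<uintptr_t>(alignment - 1);
    size_t slop = static_cast<size_t>(aligned - p);  // bytes skipped
    char* r = Allocate(bytes + slop);
    return r ? r + slop : nullptr;
  }

 private:
  char* cur_;
  char* end_;
};

int main() {
  alignas(64) char backing[1024];
  BumpAllocator a(backing, sizeof(backing));
  char* p = a.Allocate(3);               // deliberately unaligned request
  char* q = a.AllocateAligned(100, 64);  // 64-byte-aligned request
  assert(p != nullptr && q != nullptr);
  assert(reinterpret_cast<uintptr_t>(q) % 64 == 0);
  return 0;
}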
+ +#pragma once +#include <cerrno> +#include <cstddef> +#include "rocksdb/write_buffer_manager.h" + +namespace rocksdb { + +class Logger; + +class Allocator { + public: + virtual ~Allocator() {} + + virtual char* Allocate(size_t bytes) = 0; + virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) = 0; + + virtual size_t BlockSize() const = 0; +}; + +class AllocTracker { + public: + explicit AllocTracker(WriteBufferManager* write_buffer_manager); + ~AllocTracker(); + void Allocate(size_t bytes); + // Call when we're finished allocating memory so we can free it from + // the write buffer's limit. + void DoneAllocating(); + + void FreeMem(); + + bool is_freed() const { return write_buffer_manager_ == nullptr || freed_; } + + private: + WriteBufferManager* write_buffer_manager_; + std::atomic<size_t> bytes_allocated_; + bool done_allocating_; + bool freed_; + + // No copying allowed + AllocTracker(const AllocTracker&); + void operator=(const AllocTracker&); +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/arena.cc b/src/rocksdb/util/arena.cc new file mode 100644 index 00000000..d7799eb2 --- /dev/null +++ b/src/rocksdb/util/arena.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else +#include <malloc.h> +#endif +#endif +#ifndef OS_WIN +#include <sys/mman.h> +#endif +#include <algorithm> +#include "port/port.h" +#include "rocksdb/env.h" +#include "util/logging.h" +#include "util/sync_point.h" + +namespace rocksdb { + +// MSVC complains that it is already defined since it is static in the header. 
+#ifndef _MSC_VER +const size_t Arena::kInlineSize; +#endif + +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2u << 30; +static const int kAlignUnit = alignof(max_align_t); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; + } + + return block_size; +} + +Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size) + : kBlockSize(OptimizeBlockSize(block_size)), tracker_(tracker) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); + TEST_SYNC_POINT_CALLBACK("Arena::Arena:0", const_cast<size_t*>(&kBlockSize)); + alloc_bytes_remaining_ = sizeof(inline_block_); + blocks_memory_ += alloc_bytes_remaining_; + aligned_alloc_ptr_ = inline_block_; + unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; +#ifdef MAP_HUGETLB + hugetlb_size_ = huge_page_size; + if (hugetlb_size_ && kBlockSize > hugetlb_size_) { + hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; + } +#else + (void)huge_page_size; +#endif + if (tracker_ != nullptr) { + tracker_->Allocate(kInlineSize); + } +} + +Arena::~Arena() { + if (tracker_ != nullptr) { + assert(tracker_->is_freed()); + tracker_->FreeMem(); + } + for (const auto& block : blocks_) { + delete[] block; + } + +#ifdef MAP_HUGETLB + for (const auto& mmap_info : huge_blocks_) { + if (mmap_info.addr_ == nullptr) { + continue; + } + auto ret = munmap(mmap_info.addr_, mmap_info.length_); + if (ret != 0) { + // TODO(sdong): Better handling + } + } +#endif +} + +char* Arena::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { + ++irregular_block_num; + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + return AllocateNewBlock(bytes); + } + + // We waste the remaining space in the current block. + size_t size = 0; + char* block_head = nullptr; +#ifdef MAP_HUGETLB + if (hugetlb_size_) { + size = hugetlb_size_; + block_head = AllocateFromHugePage(size); + } +#endif + if (!block_head) { + size = kBlockSize; + block_head = AllocateNewBlock(size); + } + alloc_bytes_remaining_ = size - bytes; + + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + size; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + size - bytes; + return unaligned_alloc_ptr_; + } +} + +char* Arena::AllocateFromHugePage(size_t bytes) { +#ifdef MAP_HUGETLB + if (hugetlb_size_ == 0) { + return nullptr; + } + // Reserve space in `huge_blocks_` before calling `mmap`. + // Use `emplace_back()` instead of `reserve()` to let std::vector manage its + // own memory and do fewer reallocations. + // + // - If `emplace_back` throws, no memory leaks because we haven't called + // `mmap` yet. + // - If `mmap` throws, no memory leaks because the vector will be cleaned up + // via RAII. 
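This slot-first discipline (publish an empty entry, then perform the fallible acquisition, then fill the slot) reappears in Arena::AllocateNewBlock below. A hedged standalone sketch, with a hypothetical TrackedNew and plain new in place of mmap:

#include <cstddef>
#include <vector>

// Grow the container before the fallible acquisition so no path leaks.
char* TrackedNew(std::vector<char*>& blocks, size_t bytes) {
  blocks.push_back(nullptr);      // may throw, but nothing to leak yet
  char* block = new char[bytes];  // may throw; the null slot is harmless
  blocks.back() = block;          // cannot throw: the slot already exists
  return block;
}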
+  huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */);
+
+  void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
+                    (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
+
+  if (addr == MAP_FAILED) {
+    return nullptr;
+  }
+  huge_blocks_.back() = MmapInfo(addr, bytes);
+  blocks_memory_ += bytes;
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(bytes);
+  }
+  return reinterpret_cast<char*>(addr);
+#else
+  (void)bytes;
+  return nullptr;
+#endif
+}
+
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
+                             Logger* logger) {
+  assert((kAlignUnit & (kAlignUnit - 1)) ==
+         0);  // kAlignUnit must be a power of 2
+
+#ifdef MAP_HUGETLB
+  if (huge_page_size > 0 && bytes > 0) {
+    // Allocate from a huge page TLB.
+    assert(logger != nullptr);  // a logger needs to be passed in.
+    size_t reserved_size =
+        ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
+    assert(reserved_size >= bytes);
+
+    char* addr = AllocateFromHugePage(reserved_size);
+    if (addr == nullptr) {
+      ROCKS_LOG_WARN(logger,
+                     "AllocateAligned fail to allocate huge TLB pages: %s",
+                     strerror(errno));
+      // fall back to malloc
+    } else {
+      return addr;
+    }
+  }
+#else
+  (void)huge_page_size;
+  (void)logger;
+#endif
+
+  size_t current_mod =
+      reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
+  size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
+  size_t needed = bytes + slop;
+  char* result;
+  if (needed <= alloc_bytes_remaining_) {
+    result = aligned_alloc_ptr_ + slop;
+    aligned_alloc_ptr_ += needed;
+    alloc_bytes_remaining_ -= needed;
+  } else {
+    // AllocateFallback always returns aligned memory
+    result = AllocateFallback(bytes, true /* aligned */);
+  }
+  assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
+  return result;
+}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {
+  // Reserve space in `blocks_` before allocating memory via new.
+  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+  // own memory and do fewer reallocations.
+  //
+  // - If `emplace_back` throws, no memory leaks because we haven't called
+  //   `new` yet.
+  // - If `new` throws, no memory leaks because the vector will be cleaned up
+  //   via RAII.
+  blocks_.emplace_back(nullptr);
+
+  char* block = new char[block_bytes];
+  size_t allocated_size;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  allocated_size = malloc_usable_size(block);
+#ifndef NDEBUG
+  // It's hard to predict what malloc_usable_size() returns.
+  // A callback can allow users to change the costed size.
+  std::pair<size_t*, size_t*> pair(&allocated_size, &block_bytes);
+  TEST_SYNC_POINT_CALLBACK("Arena::AllocateNewBlock:0", &pair);
+#endif  // NDEBUG
+#else
+  allocated_size = block_bytes;
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  blocks_memory_ += allocated_size;
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(allocated_size);
+  }
+  blocks_.back() = block;
+  return block;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/arena.h b/src/rocksdb/util/arena.h
new file mode 100644
index 00000000..dc64154c
--- /dev/null
+++ b/src/rocksdb/util/arena.h
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Arena is an implementation of the Allocator class. For a small request, it
+// allocates from a block of pre-defined block size. For a big request, it
+// uses malloc to get the requested size directly.
+
+#pragma once
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
+#include <cstddef>
+#include <cerrno>
+#include <vector>
+#include <assert.h>
+#include <stdint.h>
+#include "util/allocator.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+class Arena : public Allocator {
+ public:
+  // No copying allowed
+  Arena(const Arena&) = delete;
+  void operator=(const Arena&) = delete;
+
+  static const size_t kInlineSize = 2048;
+  static const size_t kMinBlockSize;
+  static const size_t kMaxBlockSize;
+
+  // huge_page_size: if 0, don't use huge page TLB. If > 0 (it should be set
+  // to the supported hugepage size of the system), block allocation will try
+  // huge page TLB first. If allocation fails, it will fall back to the
+  // normal case.
+  explicit Arena(size_t block_size = kMinBlockSize,
+                 AllocTracker* tracker = nullptr, size_t huge_page_size = 0);
+  ~Arena();
+
+  char* Allocate(size_t bytes) override;
+
+  // huge_page_size: if > 0, will try to allocate from huge page TLB.
+  // The argument is the page size to use for huge page TLB. Bytes will be
+  // rounded up to a multiple of the page size to allocate through mmap
+  // with the anonymous option and huge pages on. The extra space allocated
+  // will be wasted. If allocation fails, it will fall back to the normal
+  // case. To enable it, huge pages need to be reserved for allocation,
+  // like:
+  //      sysctl -w vm.nr_hugepages=20
+  // See the Linux doc Documentation/vm/hugetlbpage.txt for details.
+  // Huge page allocation can fail. In that case it will fall back to the
+  // normal case. The error messages will be logged to the logger. So when
+  // calling with huge_page_size > 0, we highly recommend a logger is passed
+  // in. Otherwise, the error message will be printed out to stderr directly.
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr) override;
+
+  // Returns an estimate of the total memory usage of data allocated
+  // by the arena (excluding the space allocated but not yet used for future
+  // allocations).
+  size_t ApproximateMemoryUsage() const {
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+           alloc_bytes_remaining_;
+  }
+
+  size_t MemoryAllocatedBytes() const { return blocks_memory_; }
+
+  size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; }
+
+  // If an allocation is too big, we'll allocate an irregular block with the
+  // same size as that allocation.
+  size_t IrregularBlockNum() const { return irregular_block_num; }
+
+  size_t BlockSize() const override { return kBlockSize; }
+
+  bool IsInInlineBlock() const {
+    return blocks_.empty();
+  }
+
+ private:
+  char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t))));
+  // Number of bytes allocated in one block
+  const size_t kBlockSize;
+  // Array of new[] allocated memory blocks
+  typedef std::vector<char*> Blocks;
+  Blocks blocks_;
+
+  struct MmapInfo {
+    void* addr_;
+    size_t length_;
+
+    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+  };
+  std::vector<MmapInfo> huge_blocks_;
+  size_t irregular_block_num = 0;
+
+  // Stats for the current active block.
+  // For each block, we allocate aligned memory chunks from one end and
+  // allocate unaligned memory chunks from the other end.
Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? + size_t alloc_bytes_remaining_ = 0; + +#ifdef MAP_HUGETLB + size_t hugetlb_size_ = 0; +#endif // MAP_HUGETLB + char* AllocateFromHugePage(size_t bytes); + char* AllocateFallback(size_t bytes, bool aligned); + char* AllocateNewBlock(size_t block_bytes); + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_ = 0; + AllocTracker* tracker_; +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + unaligned_alloc_ptr_ -= bytes; + alloc_bytes_remaining_ -= bytes; + return unaligned_alloc_ptr_; + } + return AllocateFallback(bytes, false /* unaligned */); +} + +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. +extern size_t OptimizeBlockSize(size_t block_size); + +} // namespace rocksdb diff --git a/src/rocksdb/util/arena_test.cc b/src/rocksdb/util/arena_test.cc new file mode 100644 index 00000000..9dfc28ab --- /dev/null +++ b/src/rocksdb/util/arena_test.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +namespace { +const size_t kHugePageSize = 2 * 1024 * 1024; +} // namespace +class ArenaTest : public testing::Test {}; + +TEST_F(ArenaTest, Empty) { Arena arena0; } + +namespace { +bool CheckMemoryAllocated(size_t allocated, size_t expected) { + // The value returned by Arena::MemoryAllocatedBytes() may be greater than + // the requested memory. We choose a somewhat arbitrary upper bound of + // max_expected = expected * 1.1 to detect critical overallocation. + size_t max_expected = expected + expected / 10; + return allocated >= expected && allocated <= max_expected; +} + +void MemoryAllocatedBytesTest(size_t huge_page_size) { + const int N = 17; + size_t req_sz; // requested size + size_t bsz = 32 * 1024; // block size + size_t expected_memory_allocated; + + Arena arena(bsz, nullptr, huge_page_size); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 12 * 1024; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated = req_sz * N + Arena::kInlineSize; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); + + arena.Allocate(Arena::kInlineSize - 1); + + // requested size < quarter of a block: + // allocate a block with the default size, then try to use unused part + // of the block. So one new block will be allocated for the first + // Allocate(99) call. All the remaining calls won't lead to new allocation. 
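That expectation follows from Arena::AllocateFallback shown earlier: a request no larger than a quarter of the block size is carved out of a shared block, so only the first small request opens a new block. A hedged standalone model of that bookkeeping (hypothetical, mirroring the constants this test uses):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t bsz = 32 * 1024;  // block size used by the test above
  const size_t req_sz = 99;      // small request, well under bsz / 4
  const int N = 17;
  size_t blocks = 0, remaining = 0;
  for (int i = 0; i < N; ++i) {
    if (req_sz > remaining) {    // only the first call opens a new block
      ++blocks;
      remaining = bsz;
    }
    remaining -= req_sz;         // later calls reuse the block's free space
  }
  std::printf("new blocks: %zu\n", blocks);  // prints 1
  return 0;
}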
+ req_sz = 99; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + if (huge_page_size) { + ASSERT_TRUE( + CheckMemoryAllocated(arena.MemoryAllocatedBytes(), + expected_memory_allocated + bsz) || + CheckMemoryAllocated(arena.MemoryAllocatedBytes(), + expected_memory_allocated + huge_page_size)); + } else { + expected_memory_allocated += bsz; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); + } + + // requested size > size of a block: + // allocate requested size separately + expected_memory_allocated = arena.MemoryAllocatedBytes(); + req_sz = 8 * 1024 * 1024; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated += req_sz * N; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); +} + +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +static void ApproximateMemoryUsageTest(size_t huge_page_size) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + Arena arena(kBlockSize, nullptr, huge_page_size); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + // allocate inline bytes + const size_t kAlignUnit = alignof(max_align_t); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(kAlignUnit); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(Arena::kInlineSize / 2 - (2 * kAlignUnit)); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(Arena::kInlineSize / 2); + EXPECT_TRUE(arena.IsInInlineBlock()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - kAlignUnit); + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + Arena::kInlineSize); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); + auto mem_usage = arena.MemoryAllocatedBytes(); + if (huge_page_size) { + ASSERT_TRUE( + CheckMemoryAllocated(mem_usage, kBlockSize + Arena::kInlineSize) || + CheckMemoryAllocated(mem_usage, huge_page_size + Arena::kInlineSize)); + } else { + ASSERT_PRED2(CheckMemoryAllocated, mem_usage, + kBlockSize + Arena::kInlineSize); + } + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); + usage = arena.ApproximateMemoryUsage(); + } + if (huge_page_size) { + ASSERT_TRUE(usage > mem_usage || + usage + huge_page_size - kBlockSize == mem_usage); + } else { + ASSERT_GT(usage, mem_usage); + } +} + +static void SimpleTest(size_t huge_page_size) { + std::vector<std::pair<size_t, char*>> allocated; + Arena arena(Arena::kMinBlockSize, nullptr, huge_page_size); + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. 
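+ // (Arena::Allocate asserts that bytes > 0, per the inline definition in
+ // arena.h above, so the test bumps zero-sized requests to one byte.)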
+ s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (unsigned int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); + if (i > N / 10) { + ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); + } + } + for (unsigned int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (unsigned int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256)); + } + } +} +} // namespace + +TEST_F(ArenaTest, MemoryAllocatedBytes) { + MemoryAllocatedBytesTest(0); + MemoryAllocatedBytesTest(kHugePageSize); +} + +TEST_F(ArenaTest, ApproximateMemoryUsage) { + ApproximateMemoryUsageTest(0); + ApproximateMemoryUsageTest(kHugePageSize); +} + +TEST_F(ArenaTest, Simple) { + SimpleTest(0); + SimpleTest(kHugePageSize); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/auto_roll_logger.cc b/src/rocksdb/util/auto_roll_logger.cc new file mode 100644 index 00000000..ae6061ae --- /dev/null +++ b/src/rocksdb/util/auto_roll_logger.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "util/auto_roll_logger.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +// -- AutoRollLogger +Status AutoRollLogger::ResetLogger() { + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); + status_ = env_->NewLogger(log_fname_, &logger_); + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); + + if (!status_.ok()) { + return status_; + } + + if (logger_->GetLogFileSize() == Logger::kDoNotSupportGetLogFileSize) { + status_ = Status::NotSupported( + "The underlying logger doesn't support GetLogFileSize()"); + } + if (status_.ok()) { + cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6); + ctime_ = cached_now; + cached_now_access_count = 0; + } + + return status_; +} + +void AutoRollLogger::RollLogFile() { + // This function is called when log is rotating. Two rotations + // can happen quickly (NowMicro returns same value). To not overwrite + // previous log file we increment by one micro second and try again. + uint64_t now = env_->NowMicros(); + std::string old_fname; + do { + old_fname = OldInfoLogFileName( + dbname_, now, db_absolute_path_, db_log_dir_); + now++; + } while (env_->FileExists(old_fname).ok()); + env_->RenameFile(log_fname_, old_fname); +} + +std::string AutoRollLogger::ValistToString(const char* format, + va_list args) const { + // Any log messages longer than 1024 will get truncated. + // The user is responsible for chopping longer messages into multi line log + static const int MAXBUFFERSIZE = 1024; + char buffer[MAXBUFFERSIZE]; + + int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args); + (void) count; + assert(count >= 0); + + return buffer; +} + +void AutoRollLogger::LogInternal(const char* format, ...) 
{ + mutex_.AssertHeld(); + va_list args; + va_start(args, format); + logger_->Logv(format, args); + va_end(args); +} + +void AutoRollLogger::Logv(const char* format, va_list ap) { + assert(GetStatus().ok()); + + std::shared_ptr<Logger> logger; + { + MutexLock l(&mutex_); + if ((kLogFileTimeToRoll > 0 && LogExpired()) || + (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { + RollLogFile(); + Status s = ResetLogger(); + if (!s.ok()) { + // can't really log the error if creating a new LOG file failed + return; + } + + WriteHeaderInfo(); + } + + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + + // Another thread could have put a new Logger instance into logger_ by now. + // However, since logger is still hanging on to the previous instance + // (reference count is not zero), we don't have to worry about it being + // deleted while we are accessing it. + // Note that logv itself is not mutex protected to allow maximum concurrency, + // as thread safety should have been handled by the underlying logger. + logger->Logv(format, ap); +} + +void AutoRollLogger::WriteHeaderInfo() { + mutex_.AssertHeld(); + for (auto& header : headers_) { + LogInternal("%s", header.c_str()); + } +} + +void AutoRollLogger::LogHeader(const char* format, va_list args) { + // header message are to be retained in memory. Since we cannot make any + // assumptions about the data contained in va_list, we will retain them as + // strings + va_list tmp; + va_copy(tmp, args); + std::string data = ValistToString(format, tmp); + va_end(tmp); + + MutexLock l(&mutex_); + headers_.push_back(data); + + // Log the original message to the current log + logger_->Logv(format, args); +} + +bool AutoRollLogger::LogExpired() { + if (cached_now_access_count >= call_NowMicros_every_N_records_) { + cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6); + cached_now_access_count = 0; + } + + ++cached_now_access_count; + return cached_now >= ctime_ + kLogFileTimeToRoll; +} +#endif // !ROCKSDB_LITE + +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr<Logger>* logger) { + if (options.info_log) { + *logger = options.info_log; + return Status::OK(); + } + + Env* env = options.env; + std::string db_absolute_path; + env->GetAbsolutePath(dbname, &db_absolute_path); + std::string fname = + InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); + + env->CreateDirIfMissing(dbname); // In case it does not exist + // Currently we only support roll by time-to-roll and log size +#ifndef ROCKSDB_LITE + if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env, dbname, options.db_log_dir, options.max_log_file_size, + options.log_file_time_to_roll, options.info_log_level); + Status s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + logger->reset(result); + } + return s; + } +#endif // !ROCKSDB_LITE + // Open a log file in the same directory as the db + env->RenameFile(fname, + OldInfoLogFileName(dbname, env->NowMicros(), db_absolute_path, + options.db_log_dir)); + auto s = env->NewLogger(fname, logger); + if (logger->get() != nullptr) { + (*logger)->SetInfoLogLevel(options.info_log_level); + } + return s; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/auto_roll_logger.h b/src/rocksdb/util/auto_roll_logger.h new file mode 100644 index 00000000..64fce4d6 --- /dev/null +++ b/src/rocksdb/util/auto_roll_logger.h @@ -0,0 +1,145 @@ 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#pragma once +#include <list> +#include <string> + +#include "port/port.h" +#include "port/util_logger.h" +#include "util/filename.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +// Rolls the log file by size and/or time +class AutoRollLogger : public Logger { + public: + AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, size_t log_max_size, + size_t log_file_time_to_roll, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + env->GetAbsolutePath(dbname, &db_absolute_path_); + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + RollLogFile(); + ResetLogger(); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override; + + // Write a header entry to the log. All header information will be written + // again every time the log rolls over. + virtual void LogHeader(const char* format, va_list ap) override; + + // check if the logger has encountered any problem. + Status GetStatus() { + return status_; + } + + size_t GetLogFileSize() const override { + std::shared_ptr<Logger> logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + return logger->GetLogFileSize(); + } + + void Flush() override { + std::shared_ptr<Logger> logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + TEST_SYNC_POINT("AutoRollLogger::Flush:PinnedLogger"); + if (logger) { + logger->Flush(); + } + } + + virtual ~AutoRollLogger() { + if (logger_ && !closed_) { + logger_->Close(); + } + } + + void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) { + call_NowMicros_every_N_records_ = call_NowMicros_every_N_records; + } + + // Expose the log file path for testing purpose + std::string TEST_log_fname() const { + return log_fname_; + } + + uint64_t TEST_ctime() const { return ctime_; } + + protected: + // Implementation of Close() + virtual Status CloseImpl() override { + if (logger_) { + return logger_->Close(); + } else { + return Status::OK(); + } + } + + private: + bool LogExpired(); + Status ResetLogger(); + void RollLogFile(); + // Log message to logger without rolling + void LogInternal(const char* format, ...); + // Serialize the va_list to a string + std::string ValistToString(const char* format, va_list args) const; + // Write the logs marked as headers to the new log file + void WriteHeaderInfo(); + std::string log_fname_; // Current active info log's file name. 
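+ // Path components used to derive log_fname_ and the rolled-over
+ // (OldInfoLogFileName) file names: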
+ std::string dbname_; + std::string db_log_dir_; + std::string db_absolute_path_; + Env* env_; + std::shared_ptr<Logger> logger_; + // current status of the logger + Status status_; + const size_t kMaxLogFileSize; + const size_t kLogFileTimeToRoll; + // header information + std::list<std::string> headers_; + // to avoid frequent env->NowMicros() calls, we cached the current time + uint64_t cached_now; + uint64_t ctime_; + uint64_t cached_now_access_count; + uint64_t call_NowMicros_every_N_records_; + mutable port::Mutex mutex_; +}; +#endif // !ROCKSDB_LITE + +// Facade to craete logger automatically +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr<Logger>* logger); + +} // namespace rocksdb diff --git a/src/rocksdb/util/auto_roll_logger_test.cc b/src/rocksdb/util/auto_roll_logger_test.cc new file mode 100644 index 00000000..ab9e0595 --- /dev/null +++ b/src/rocksdb/util/auto_roll_logger_test.cc @@ -0,0 +1,530 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#ifndef ROCKSDB_LITE + +#include "util/auto_roll_logger.h" +#include <errno.h> +#include <sys/stat.h> +#include <algorithm> +#include <cmath> +#include <fstream> +#include <iostream> +#include <iterator> +#include <string> +#include <thread> +#include <vector> +#include "port/port.h" +#include "rocksdb/db.h" +#include "util/logging.h" +#include "util/sync_point.h" +#include "util/testharness.h" + +namespace rocksdb { +namespace { +class NoSleepEnv : public EnvWrapper { + public: + NoSleepEnv(Env* base) : EnvWrapper(base) {} + void SleepForMicroseconds(int micros) override { + fake_time_ += static_cast<uint64_t>(micros); + } + + uint64_t NowNanos() override { return fake_time_ * 1000; } + + uint64_t NowMicros() override { return fake_time_; } + + private: + uint64_t fake_time_ = 6666666666; +}; +} // namespace + +class AutoRollLoggerTest : public testing::Test { + public: + static void InitTestDb() { +#ifdef OS_WIN + // Replace all slashes in the path so windows CompSpec does not + // become confused + std::string testDir(kTestDir); + std::replace_if(testDir.begin(), testDir.end(), + [](char ch) { return ch == '/'; }, '\\'); + std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir; +#else + std::string deleteCmd = "rm -rf " + kTestDir; +#endif + ASSERT_TRUE(system(deleteCmd.c_str()) == 0); + Env::Default()->CreateDir(kTestDir); + } + + void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, + const std::string& log_message); + void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, + const std::string& log_message); + + static const std::string kSampleMessage; + static const std::string kTestDir; + static const std::string kLogFile; + static Env* default_env; +}; + +const std::string AutoRollLoggerTest::kSampleMessage( + "this is the message to be written to the log file!!"); +const std::string AutoRollLoggerTest::kTestDir( + test::PerThreadDBPath("db_log_test")); +const std::string AutoRollLoggerTest::kLogFile( + test::PerThreadDBPath("db_log_test") + "/LOG"); +Env* AutoRollLoggerTest::default_env = Env::Default(); + +// In this test we only want to Log some simple log message with +// no format. 
LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call ROCKS_LOG_INFO(logger, log_message) directly. +namespace { +void LogMessage(Logger* logger, const char* message) { + ROCKS_LOG_INFO(logger, "%s", message); +} + +void LogMessage(const InfoLogLevel log_level, Logger* logger, + const char* message) { + Log(log_level, logger, "%s", message); +} +} // namespace + +void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, + size_t log_max_size, + const std::string& log_message) { + logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + // measure the size of each message, which is supposed + // to be equal or greater than log_message.size() + LogMessage(logger, log_message.c_str()); + size_t message_size = logger->GetLogFileSize(); + size_t current_log_size = message_size; + + // Test the cases when the log file will not be rolled. + while (current_log_size + message_size < log_max_size) { + LogMessage(logger, log_message.c_str()); + current_log_size += message_size; + ASSERT_EQ(current_log_size, logger->GetLogFileSize()); + } + + // Now the log file will be rolled + LogMessage(logger, log_message.c_str()); + // Since rotation is checked before actual logging, we need to + // trigger the rotation by logging another message. + LogMessage(logger, log_message.c_str()); + + ASSERT_TRUE(message_size == logger->GetLogFileSize()); +} + +void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, + size_t time, + const std::string& log_message) { + uint64_t expected_ctime; + uint64_t actual_ctime; + + uint64_t total_log_size; + EXPECT_OK(env->GetFileSize(kLogFile, &total_log_size)); + expected_ctime = logger->TEST_ctime(); + logger->SetCallNowMicrosEveryNRecords(0); + + // -- Write to the log for several times, which is supposed + // to be finished before time. + for (int i = 0; i < 10; ++i) { + env->SleepForMicroseconds(50000); + LogMessage(logger, log_message.c_str()); + EXPECT_OK(logger->GetStatus()); + // Make sure we always write to the same log file (by + // checking the create time); + + actual_ctime = logger->TEST_ctime(); + + // Also make sure the log size is increasing. + EXPECT_EQ(expected_ctime, actual_ctime); + EXPECT_GT(logger->GetLogFileSize(), total_log_size); + total_log_size = logger->GetLogFileSize(); + } + + // -- Make the log file expire + env->SleepForMicroseconds(static_cast<int>(time * 1000000)); + LogMessage(logger, log_message.c_str()); + + // At this time, the new log file should be created. + actual_ctime = logger->TEST_ctime(); + EXPECT_LT(expected_ctime, actual_ctime); + EXPECT_LT(logger->GetLogFileSize(), total_log_size); +} + +TEST_F(AutoRollLoggerTest, RollLogFileBySize) { + InitTestDb(); + size_t log_max_size = 1024 * 5; + + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + + RollLogFileBySizeTest(&logger, log_max_size, + kSampleMessage + ":RollLogFileBySize"); +} + +TEST_F(AutoRollLoggerTest, RollLogFileByTime) { + NoSleepEnv nse(Env::Default()); + + size_t time = 2; + size_t log_size = 1024 * 5; + + InitTestDb(); + // -- Test the existence of file during the server restart. 
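These tests drive the logger through AutoRollLogger::Logv, whose locking discipline (shown earlier in auto_roll_logger.cc, and again in GetLogFileSize and Flush) is worth isolating: pin the current logger in a shared_ptr under the mutex, then do the slow I/O outside the lock, so a concurrent roll cannot delete the instance in use. A hedged standalone sketch with hypothetical Sink and Roller names:

#include <memory>
#include <mutex>

struct Sink {
  void Write(const char* /*msg*/) {}  // stand-in for slow file I/O
};

class Roller {
 public:
  void Log(const char* msg) {
    std::shared_ptr<Sink> pinned;
    {
      std::lock_guard<std::mutex> l(mu_);
      pinned = sink_;   // pin the current sink; cheap under the lock
    }
    pinned->Write(msg);  // slow work happens outside the lock
  }
  void Roll() {
    std::lock_guard<std::mutex> l(mu_);
    sink_ = std::make_shared<Sink>();  // old sink freed once unpinned
  }

 private:
  std::mutex mu_;
  std::shared_ptr<Sink> sink_{std::make_shared<Sink>()};
};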
+ ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); + AutoRollLogger logger(&nse, kTestDir, "", log_size, time); + ASSERT_OK(default_env->FileExists(kLogFile)); + + RollLogFileByTimeTest(&nse, &logger, time, + kSampleMessage + ":RollLogFileByTime"); +} + +TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { + // If only 'log_max_size' options is specified, then every time + // when rocksdb is restarted, a new empty log file will be created. + InitTestDb(); + // WORKAROUND: + // avoid complier's complaint of "comparison between signed + // and unsigned integer expressions" because literal 0 is + // treated as "singed". + size_t kZero = 0; + size_t log_size = 1024; + + AutoRollLogger* logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + + LogMessage(logger, kSampleMessage.c_str()); + ASSERT_GT(logger->GetLogFileSize(), kZero); + delete logger; + + // reopens the log file and an empty log file will be created. + logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), kZero); + delete logger; +} + +TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { + size_t time = 2, log_max_size = 1024 * 5; + + InitTestDb(); + + NoSleepEnv nse(Env::Default()); + AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time); + + // Test the ability to roll by size + RollLogFileBySizeTest(&logger, log_max_size, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); + + // Test the ability to roll by Time + RollLogFileByTimeTest(&nse, &logger, time, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); +} + +#ifndef OS_WIN +// TODO: does not build for Windows because of PosixLogger use below. Need to +// port +TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { + DBOptions options; + NoSleepEnv nse(Env::Default()); + std::shared_ptr<Logger> logger; + + // Normal logger + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_TRUE(dynamic_cast<PosixLogger*>(logger.get())); + + // Only roll by size + InitTestDb(); + options.max_log_file_size = 1024; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + ASSERT_TRUE(auto_roll_logger); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - size"); + + // Only roll by Time + options.env = &nse; + InitTestDb(); + options.max_log_file_size = 0; + options.log_file_time_to_roll = 2; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - time"); + + // roll by both Time and size + InitTestDb(); + options.max_log_file_size = 1024 * 5; + options.log_file_time_to_roll = 2; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - both"); + RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - both"); +} + +TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { + DBOptions options; + std::shared_ptr<Logger> logger; + + InitTestDb(); + options.max_log_file_size = 1024 * 5; + 
ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + ASSERT_TRUE(auto_roll_logger); + rocksdb::port::Thread flush_thread; + + // Notes: + // (1) Need to pin the old logger before beginning the roll, as rolling grabs + // the mutex, which would prevent us from accessing the old logger. This + // also marks flush_thread with AutoRollLogger::Flush:PinnedLogger. + // (2) Need to reset logger during PosixLogger::Flush() to exercise a race + // condition case, which is executing the flush with the pinned (old) + // logger after auto-roll logger has cut over to a new logger. + // (3) PosixLogger::Flush() happens in both threads but its SyncPoints only + // are enabled in flush_thread (the one pinning the old logger). + rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( + {{"AutoRollLogger::Flush:PinnedLogger", + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"}, + {"PosixLogger::Flush:Begin1", + "AutoRollLogger::ResetLogger:BeforeNewLogger"}, + {"AutoRollLogger::ResetLogger:AfterNewLogger", + "PosixLogger::Flush:Begin2"}}, + {{"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin1"}, + {"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin2"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + flush_thread = port::Thread ([&]() { auto_roll_logger->Flush(); }); + TEST_SYNC_POINT( + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); + RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":LogFlushWhileRolling"); + flush_thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif // OS_WIN + +TEST_F(AutoRollLoggerTest, InfoLogLevel) { + InitTestDb(); + + size_t log_size = 8192; + size_t log_lines = 0; + // an extra-scope to force the AutoRollLogger to flush the log file when it + // becomes out of scope. + { + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + for (int log_type = InfoLogLevel::DEBUG_LEVEL; + log_type <= InfoLogLevel::HEADER_LEVEL; log_type++) { + // log messages with log level smaller than log_level will not be + // logged. + LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str()); + } + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + + // again, messages with level smaller than log_level will not be logged. 
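+ // (Only messages at or above log_level are written, which is why each
+ // pass adds HEADER_LEVEL - log_level + 1 of the six calls to log_lines.)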
+ ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + } + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + size_t lines = std::count(std::istreambuf_iterator<char>(inFile), + std::istreambuf_iterator<char>(), '\n'); + ASSERT_EQ(log_lines, lines); + inFile.close(); +} + +TEST_F(AutoRollLoggerTest, Close) { + InitTestDb(); + + size_t log_size = 8192; + size_t log_lines = 0; + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + for (int log_type = InfoLogLevel::DEBUG_LEVEL; + log_type <= InfoLogLevel::HEADER_LEVEL; log_type++) { + // log messages with log level smaller than log_level will not be + // logged. + LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str()); + } + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + + // again, messages with level smaller than log_level will not be logged. + ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + ASSERT_EQ(logger.Close(), Status::OK()); + + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + size_t lines = std::count(std::istreambuf_iterator<char>(inFile), + std::istreambuf_iterator<char>(), '\n'); + ASSERT_EQ(log_lines, lines); + inFile.close(); +} + +// Test the logger Header function for roll over logs +// We expect the new logs creates as roll over to carry the headers specified +static std::vector<std::string> GetOldFileNames(const std::string& path) { + std::vector<std::string> ret; + + const std::string dirname = path.substr(/*start=*/0, path.find_last_of("/")); + const std::string fname = path.substr(path.find_last_of("/") + 1); + + std::vector<std::string> children; + Env::Default()->GetChildren(dirname, &children); + + // We know that the old log files are named [path]<something> + // Return all entities that match the pattern + for (auto& child : children) { + if (fname != child && child.find(fname) == 0) { + ret.push_back(dirname + "/" + child); + } + } + + return ret; +} + +// Return the number of lines where a given pattern was found in the file +static size_t GetLinesCount(const std::string& fname, + const std::string& pattern) { + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(fname.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(pattern) != std::string::npos) { + count++; + } + } + + return count; +} + +TEST_F(AutoRollLoggerTest, LogHeaderTest) { + static const size_t MAX_HEADERS = 10; + static const size_t LOG_MAX_SIZE = 1024 * 5; + static const std::string 
HEADER_STR = "Log header line"; + + // test_num == 0 -> standard call to Header() + // test_num == 1 -> call to Log() with InfoLogLevel::HEADER_LEVEL + for (int test_num = 0; test_num < 2; test_num++) { + + InitTestDb(); + + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + + if (test_num == 0) { + // Log some headers explicitly using Header() + for (size_t i = 0; i < MAX_HEADERS; i++) { + Header(&logger, "%s %" ROCKSDB_PRIszt, HEADER_STR.c_str(), i); + } + } else if (test_num == 1) { + // HEADER_LEVEL should make this behave like calling Header() + for (size_t i = 0; i < MAX_HEADERS; i++) { + ROCKS_LOG_HEADER(&logger, "%s %" ROCKSDB_PRIszt, HEADER_STR.c_str(), i); + } + } + + const std::string newfname = logger.TEST_log_fname(); + + // Log enough data to cause a roll over + int i = 0; + for (size_t iter = 0; iter < 2; iter++) { + while (logger.GetLogFileSize() < LOG_MAX_SIZE) { + Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i); + ++i; + } + + Info(&logger, "Rollover"); + } + + // Flush the log for the latest file + LogFlush(&logger); + + const auto oldfiles = GetOldFileNames(newfname); + + ASSERT_EQ(oldfiles.size(), (size_t) 2); + + for (auto& oldfname : oldfiles) { + // verify that the files rolled over + ASSERT_NE(oldfname, newfname); + // verify that the old log contains all the header logs + ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); + } + } +} + +TEST_F(AutoRollLoggerTest, LogFileExistence) { + rocksdb::DB* db; + rocksdb::Options options; +#ifdef OS_WIN + // Replace all slashes in the path so windows CompSpec does not + // become confused + std::string testDir(kTestDir); + std::replace_if(testDir.begin(), testDir.end(), + [](char ch) { return ch == '/'; }, '\\'); + std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir; +#else + std::string deleteCmd = "rm -rf " + kTestDir; +#endif + ASSERT_EQ(system(deleteCmd.c_str()), 0); + options.max_log_file_size = 100 * 1024 * 1024; + options.create_if_missing = true; + ASSERT_OK(rocksdb::DB::Open(options, kTestDir, &db)); + ASSERT_OK(default_env->FileExists(kLogFile)); + delete db; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as AutoRollLogger is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/util/autovector.h b/src/rocksdb/util/autovector.h new file mode 100644 index 00000000..5843fa8a --- /dev/null +++ b/src/rocksdb/util/autovector.h @@ -0,0 +1,366 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <algorithm> +#include <cassert> +#include <initializer_list> +#include <iterator> +#include <stdexcept> +#include <vector> + +namespace rocksdb { + +#ifdef ROCKSDB_LITE +template <class T, size_t kSize = 8> +class autovector : public std::vector<T> { + using std::vector<T>::vector; +}; +#else +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. 
+// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. +// +// Currently we don't support: +// * reserve()/shrink_to_fit() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template <class T, size_t kSize = 8> +class autovector { + public: + // General STL-style container member types. + typedef T value_type; + typedef typename std::vector<T>::difference_type difference_type; + typedef typename std::vector<T>::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template <class TAutoVector, class TValueType> + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl<TAutoVector, TValueType> self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect), index_(index) {}; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() {} + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // ++iterator + self_type& operator++() { + ++index_; + return *this; + } + + // iterator++ + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // --iterator + self_type& operator--() { + --index_; + return *this; + } + + // iterator-- + self_type operator--(int) { + auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) const { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) const { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + + const_reference operator*() const { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + const_pointer operator->() const { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { return !(*this == other); } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + 
return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl<autovector, value_type> iterator; + typedef iterator_impl<const autovector, const value_type> const_iterator; + typedef std::reverse_iterator<iterator> reverse_iterator; + typedef std::reverse_iterator<const_iterator> const_reverse_iterator; + + autovector() : values_(reinterpret_cast<pointer>(buf_)) {} + + autovector(std::initializer_list<T> init_list) + : values_(reinterpret_cast<pointer>(buf_)) { + for (const T& item : init_list) { + push_back(item); + } + } + + ~autovector() { clear(); } + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. + return vect_.capacity() == 0; + } + + size_type size() const { return num_stack_items_ + vect_.size(); } + + // resize does not guarantee anything about the contents of the newly + // available elements + void resize(size_type n) { + if (n > kSize) { + vect_.resize(n - kSize); + while (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + num_stack_items_ = kSize; + } else { + vect_.clear(); + while (num_stack_items_ < n) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + while (num_stack_items_ > n) { + values_[--num_stack_items_].~value_type(); + } + } + } + + bool empty() const { return size() == 0; } + + const_reference operator[](size_type n) const { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + reference operator[](size_type n) { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + const_reference at(size_type n) const { + assert(n < size()); + return (*this)[n]; + } + + reference at(size_type n) { + assert(n < size()); + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = item; + } else { + vect_.push_back(item); + } + } + + template <class... Args> + void emplace_back(Args&&... 
args) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) + value_type(std::forward<Args>(args)...); + } else { + vect_.emplace_back(std::forward<Args>(args)...); + } + } + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + values_[--num_stack_items_].~value_type(); + } + } + + void clear() { + while (num_stack_items_ > 0) { + values_[--num_stack_items_].~value_type(); + } + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { assign(other); } + + autovector& operator=(const autovector& other) { return assign(other); } + + // -- Iterator Operations + iterator begin() { return iterator(this, 0); } + + const_iterator begin() const { return const_iterator(this, 0); } + + iterator end() { return iterator(this, this->size()); } + + const_iterator end() const { return const_iterator(this, this->size()); } + + reverse_iterator rbegin() { return reverse_iterator(end()); } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { return reverse_iterator(begin()); } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + alignas(alignof( + value_type)) char buf_[kSize * + sizeof(value_type)]; // the first `kSize` items + pointer values_; + // used only if there are more than `kSize` items. + std::vector<T> vect_; +}; + +template <class T, size_t kSize> +autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) { + values_ = reinterpret_cast<pointer>(buf_); + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/src/rocksdb/util/autovector_test.cc b/src/rocksdb/util/autovector_test.cc new file mode 100644 index 00000000..13299669 --- /dev/null +++ b/src/rocksdb/util/autovector_test.cc @@ -0,0 +1,330 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
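A minimal usage sketch of the autovector defined above, illustrative only and not part of the imported sources; it assumes a non-ROCKSDB_LITE build, since only_in_stack() does not exist in the LITE fallback.

#include <cassert>
#include <string>

#include "util/autovector.h"

// Sketch: with kSize == 4, the first four elements live in the in-object
// array; the fifth push_back spills into the internal heap vector.
int main() {
  rocksdb::autovector<std::string, 4> names;
  for (int i = 0; i < 4; ++i) {
    names.push_back("key-" + std::to_string(i));
  }
  assert(names.only_in_stack());  // no heap allocation yet
  names.push_back("key-4");       // crosses kSize, heap vector takes over
  assert(!names.only_in_stack());
  assert(names.size() == 5);
  assert(names[4] == "key-4");    // operator[] spans both storage regions
  return 0;
}

The unit tests that follow verify exactly this stack-to-heap transition.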
+ +#include <atomic> +#include <iostream> +#include <string> +#include <utility> + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/string_util.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using std::cout; +using std::endl; + +namespace rocksdb { + +class AutoVectorTest : public testing::Test {}; +const unsigned long kSize = 8; + +namespace { +template <class T> +void AssertAutoVectorOnlyInStack(autovector<T, kSize>* vec, bool result) { +#ifndef ROCKSDB_LITE + ASSERT_EQ(vec->only_in_stack(), result); +#else + (void) vec; + (void) result; +#endif // !ROCKSDB_LITE +} +} // namespace + +TEST_F(AutoVectorTest, PushBackAndPopBack) { + autovector<size_t, kSize> vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + AssertAutoVectorOnlyInStack(&vec, true); + } else { + AssertAutoVectorOnlyInStack(&vec, false); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + AssertAutoVectorOnlyInStack(&vec, false); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST_F(AutoVectorTest, EmplaceBack) { + typedef std::pair<size_t, std::string> ValType; + autovector<ValType, kSize> vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, ToString(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + AssertAutoVectorOnlyInStack(&vec, true); + } else { + AssertAutoVectorOnlyInStack(&vec, false); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(ToString(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + AssertAutoVectorOnlyInStack(&vec, false); +} + +TEST_F(AutoVectorTest, Resize) { + autovector<size_t, kSize> vec; + + vec.resize(kSize); + AssertAutoVectorOnlyInStack(&vec, true); + for (size_t i = 0; i < kSize; ++i) { + vec[i] = i; + } + + vec.resize(kSize * 2); + AssertAutoVectorOnlyInStack(&vec, false); + for (size_t i = 0; i < kSize; ++i) { + ASSERT_EQ(vec[i], i); + } + for (size_t i = 0; i < kSize; ++i) { + vec[i + kSize] = i; + } + + vec.resize(1); + ASSERT_EQ(1U, vec.size()); +} + +namespace { +void AssertEqual( + const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); +#ifndef ROCKSDB_LITE + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); +#endif // !ROCKSDB_LITE + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} +} // namespace + +TEST_F(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : { kSize / 2, kSize * 1000 }) { + autovector<size_t, kSize> vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector<size_t, kSize> other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector<size_t, kSize> other(vec); + AssertEqual(other, vec); + } + } +} + +TEST_F(AutoVectorTest, Iterators) { + autovector<std::string, kSize> vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(ToString(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast<size_t>(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); + } +} + +namespace { +std::vector<std::string> GetTestKeys(size_t size) { + std::vector<std::string> keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + rocksdb::ToString(index++); + } + return keys; +} +} // namespace + +template <class TVector> +void BenchmarkVectorCreationAndInsertion( + std::string name, size_t ops, size_t item_size, + const std::vector<typename TVector::value_type>& items) { + auto env = Env::Default(); + + int index = 0; + auto start_time = env->NowNanos(); + auto ops_remaining = ops; + while(ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template <class TVector> +size_t BenchmarkSequenceAccess(std::string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; +} + +// This test case only reports the performance between std::vector<std::string> +// and autovector<std::string>. 
We chose string for comparison because in most +// of our use cases we used std::vector<std::string>. +TEST_F(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. + size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialize. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized. + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion<std::vector<std::string>>( + "std::vector<std::string>", kOps, insertions, string_keys); + BenchmarkVectorCreationAndInsertion<autovector<std::string, kSize>>( + "autovector<std::string>", kOps, insertions, string_keys); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + std::vector<uint64_t> int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion<std::vector<uint64_t>>( + "std::vector<uint64_t>", kOps, insertions, int_keys); + BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>( + "autovector<uint64_t>", kOps, insertions, int_keys + ); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + BenchmarkSequenceAccess<std::vector<std::string>>("std::vector", kOps, + elem_size); + BenchmarkSequenceAccess<autovector<std::string, kSize>>("autovector", kOps, + elem_size); + cout << "-----------------------------------" << endl; + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/bloom.cc b/src/rocksdb/util/bloom.cc new file mode 100644 index 00000000..9c05f710 --- /dev/null +++ b/src/rocksdb/util/bloom.cc @@ -0,0 +1,372 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
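Before the bloom filter implementation, a standalone sketch of the cache-local double-hashing scheme that FullFilterBitsBuilder::AddHash below uses. SetBloomBits and kCacheLineBits are our illustrative names, not identifiers from the file; kCacheLineBits stands in for CACHE_LINE_SIZE * 8, assuming 64-byte cache lines.

#include <cstdint>
#include <vector>

// One base hash picks a single cache line, then num_probes bit positions
// inside that line are derived by repeatedly adding a rotated copy of the
// hash (double hashing, per [Kirsch, Mitzenmacher 2006]).
// `data` must hold num_lines cache lines worth of bytes; num_lines > 0.
static void SetBloomBits(uint32_t h, std::vector<uint8_t>* data,
                         uint32_t num_lines, uint32_t num_probes) {
  const uint32_t kCacheLineBits = 64 * 8;
  const uint32_t delta = (h >> 17) | (h << 15);            // rotate right 17
  const uint32_t base = (h % num_lines) * kCacheLineBits;  // choose one line
  for (uint32_t i = 0; i < num_probes; ++i) {
    const uint32_t bitpos = base + (h % kCacheLineBits);   // stay in the line
    (*data)[bitpos / 8] |= static_cast<uint8_t>(1 << (bitpos % 8));
    h += delta;
  }
}

Confining all probes to one line is the locality trick visible in the real code below: a lookup touches a single cache line regardless of num_probes.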
+ +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_bits_builder.h" +#include "table/full_filter_block.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +class BlockBasedFilterBlockBuilder; +class FullFilterBlockBuilder; + +FullFilterBitsBuilder::FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes) + : bits_per_key_(bits_per_key), num_probes_(num_probes) { + assert(bits_per_key_); + } + + FullFilterBitsBuilder::~FullFilterBitsBuilder() {} + + void FullFilterBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) { + uint32_t total_bits, num_lines; + char* data = ReserveSpace(static_cast<int>(hash_entries_.size()), + &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + } + data[total_bits/8] = static_cast<char>(num_probes_); + EncodeFixed32(data + total_bits/8 + 1, static_cast<uint32_t>(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); + } + +uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. + if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + assert(bits_per_key_); + if (num_entry != 0) { + uint32_t total_bits_tmp = num_entry * static_cast<uint32_t>(bits_per_key_); + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; + } + + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} + +char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + char* data = new char[sz]; + memset(data, 0, sz); + return data; +} + +int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) { + assert(bits_per_key_); + assert(space > 0); + uint32_t dont_care1, dont_care2; + int high = (int) (space * 8 / bits_per_key_ + 1); + int low = 1; + int n = high; + for (; n >= low; n--) { + uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2); + if (sz <= space) { + break; + } + } + assert(n < high); // High should be an overestimation + return n; +} + +inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, uint32_t total_bits) { +#ifdef NDEBUG + (void)total_bits; +#endif + assert(num_lines > 0 && total_bits > 0); + + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8); + + for (uint32_t i = 0; i < num_probes_; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be 
optimized + // to a simple operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + data[bitpos / 8] |= (1 << (bitpos % 8)); + + h += delta; + } +} + +namespace { +class FullFilterBitsReader : public FilterBitsReader { + public: + explicit FullFilterBitsReader(const Slice& contents) + : data_(const_cast<char*>(contents.data())), + data_len_(static_cast<uint32_t>(contents.size())), + num_probes_(0), + num_lines_(0), + log2_cache_line_size_(0) { + assert(data_); + GetFilterMeta(contents, &num_probes_, &num_lines_); + // Sanitize broken parameter + if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) { + num_lines_ = 0; + num_probes_ = 0; + } else if (num_lines_ != 0) { + while (true) { + uint32_t num_lines_at_curr_cache_size = + (data_len_ - 5) >> log2_cache_line_size_; + if (num_lines_at_curr_cache_size == 0) { + // The cache line size seems not a power of two. It's not supported + // and indicates a corruption so disable using this filter. + assert(false); + num_lines_ = 0; + num_probes_ = 0; + break; + } + if (num_lines_at_curr_cache_size == num_lines_) { + break; + } + ++log2_cache_line_size_; + } + } + } + + ~FullFilterBitsReader() override {} + + bool MayMatch(const Slice& entry) override { + if (data_len_ <= 5) { // remain same with original filter + return false; + } + // Other Error params, including a broken filter, regarded as match + if (num_probes_ == 0 || num_lines_ == 0) return true; + uint32_t hash = BloomHash(entry); + return HashMayMatch(hash, Slice(data_, data_len_), + num_probes_, num_lines_); + } + + private: + // Filter meta data + char* data_; + uint32_t data_len_; + size_t num_probes_; + uint32_t num_lines_; + uint32_t log2_cache_line_size_; + + // Get num_probes, and num_lines from filter + // If filter format broken, set both to 0. + void GetFilterMeta(const Slice& filter, size_t* num_probes, + uint32_t* num_lines); + + // "filter" contains the data appended by a preceding call to + // FilterBitsBuilder::Finish. This method must return true if the key was + // passed to FilterBitsBuilder::AddKey. This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. + // + // hash: target to be checked + // filter: the whole filter, including meta data bytes + // num_probes: number of probes, read before hand + // num_lines: filter metadata, read before hand + // Before calling this function, need to ensure the input meta data + // is valid. 
+ bool HashMayMatch(const uint32_t& hash, const Slice& filter, + const size_t& num_probes, const uint32_t& num_lines); + + // No Copy allowed + FullFilterBitsReader(const FullFilterBitsReader&); + void operator=(const FullFilterBitsReader&); +}; + +void FullFilterBitsReader::GetFilterMeta(const Slice& filter, + size_t* num_probes, uint32_t* num_lines) { + uint32_t len = static_cast<uint32_t>(filter.size()); + if (len <= 5) { + // filter is empty or broken + *num_probes = 0; + *num_lines = 0; + return; + } + + *num_probes = filter.data()[len - 5]; + *num_lines = DecodeFixed32(filter.data() + len - 4); +} + +bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, + const Slice& filter, const size_t& num_probes, + const uint32_t& num_lines) { + uint32_t len = static_cast<uint32_t>(filter.size()); + if (len <= 5) return false; // remain the same with original filter + + // It is ensured the params are valid before calling it + assert(num_probes != 0); + assert(num_lines != 0 && (len - 5) % num_lines == 0); + const char* data = filter.data(); + + uint32_t h = hash; + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + // Left shift by an extra 3 to convert bytes to bits + uint32_t b = (h % num_lines) << (log2_cache_line_size_ + 3); + PREFETCH(&data[b / 8], 0 /* rw */, 1 /* locality */); + PREFETCH(&data[b / 8 + (1 << log2_cache_line_size_) - 1], 0 /* rw */, + 1 /* locality */); + + for (uint32_t i = 0; i < num_probes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h & ((1 << (log2_cache_line_size_ + 3)) - 1)); + if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + + h += delta; + } + + return true; +} + +// An implementation of filter policy +class BloomFilterPolicy : public FilterPolicy { + public: + explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder) + : bits_per_key_(bits_per_key), hash_func_(BloomHash), + use_block_based_builder_(use_block_based_builder) { + initialize(); + } + + ~BloomFilterPolicy() override {} + + const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + // Compute bloom filter size (in both bits and bytes) + size_t bits = n * bits_per_key_; + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if (bits < 64) bits = 64; + + size_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes + char* array = &(*dst)[init_size]; + for (size_t i = 0; i < (size_t)n; i++) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + uint32_t h = hash_func_(keys[i]); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < num_probes_; j++) { + const uint32_t bitpos = h % bits; + array[bitpos/8] |= (1 << (bitpos % 8)); + h += delta; + } + } + } + + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override { + const size_t len = bloom_filter.size(); + if (len < 2) return false; + + const char* array = bloom_filter.data(); + const size_t bits = (len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. 
+ const size_t k = array[len-1]; + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + + uint32_t h = hash_func_(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k; j++) { + const uint32_t bitpos = h % bits; + if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; + h += delta; + } + return true; + } + + FilterBitsBuilder* GetFilterBitsBuilder() const override { + if (use_block_based_builder_) { + return nullptr; + } + + return new FullFilterBitsBuilder(bits_per_key_, num_probes_); + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + return new FullFilterBitsReader(contents); + } + + // If choose to use block based builder + bool UseBlockBasedBuilder() { return use_block_based_builder_; } + + private: + size_t bits_per_key_; + size_t num_probes_; + uint32_t (*hash_func_)(const Slice& key); + + const bool use_block_based_builder_; + + void initialize() { + // We intentionally round down to reduce probing cost a little bit + num_probes_ = static_cast<size_t>(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (num_probes_ < 1) num_probes_ = 1; + if (num_probes_ > 30) num_probes_ = 30; + } +}; + +} // namespace + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder) { + return new BloomFilterPolicy(bits_per_key, use_block_based_builder); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/bloom_test.cc b/src/rocksdb/util/bloom_test.cc new file mode 100644 index 00000000..4b25e9b6 --- /dev/null +++ b/src/rocksdb/util/bloom_test.cc @@ -0,0 +1,317 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#include <vector> + +#include "rocksdb/filter_policy.h" +#include "table/full_filter_bits_builder.h" +#include "util/arena.h" +#include "util/gflags_compat.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(bits_per_key, 10, ""); + +namespace rocksdb { + +static const int kVerbose = 1; + +static Slice Key(int i, char* buffer) { + std::string s; + PutFixed32(&s, static_cast<uint32_t>(i)); + memcpy(buffer, s.c_str(), sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +class BloomTest : public testing::Test { + private: + const FilterPolicy* policy_; + std::string filter_; + std::vector<std::string> keys_; + + public: + BloomTest() : policy_( + NewBloomFilterPolicy(FLAGS_bits_per_key)) {} + + ~BloomTest() override { delete policy_; } + + void Reset() { + keys_.clear(); + filter_.clear(); + } + + void Add(const Slice& s) { + keys_.push_back(s.ToString()); + } + + void Build() { + std::vector<Slice> key_slices; + for (size_t i = 0; i < keys_.size(); i++) { + key_slices.push_back(Slice(keys_[i])); + } + filter_.clear(); + policy_->CreateFilter(&key_slices[0], static_cast<int>(key_slices.size()), + &filter_); + keys_.clear(); + if (kVerbose >= 2) DumpFilter(); + } + + size_t FilterSize() const { + return filter_.size(); + } + + void DumpFilter() { + fprintf(stderr, "F("); + for (size_t i = 0; i+1 < filter_.size(); i++) { + const unsigned int c = static_cast<unsigned int>(filter_[i]); + for (int j = 0; j < 8; j++) { + fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.'); + } + } + fprintf(stderr, ")\n"); + } + + bool Matches(const Slice& s) { + if (!keys_.empty()) { + Build(); + } + return policy_->KeyMayMatch(s, filter_); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST_F(BloomTest, EmptyFilter) { + ASSERT_TRUE(! Matches("hello")); + ASSERT_TRUE(! Matches("world")); +} + +TEST_F(BloomTest, Small) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(! Matches("x")); + ASSERT_TRUE(! 
Matches("foo")); +} + +TEST_F(BloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast<int>(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often + else good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +class FullBloomTest : public testing::Test { + private: + const FilterPolicy* policy_; + std::unique_ptr<FilterBitsBuilder> bits_builder_; + std::unique_ptr<FilterBitsReader> bits_reader_; + std::unique_ptr<const char[]> buf_; + size_t filter_size_; + + public: + FullBloomTest() : + policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)), + filter_size_(0) { + Reset(); + } + + ~FullBloomTest() override { delete policy_; } + + FullFilterBitsBuilder* GetFullFilterBitsBuilder() { + return dynamic_cast<FullFilterBitsBuilder*>(bits_builder_.get()); + } + + void Reset() { + bits_builder_.reset(policy_->GetFilterBitsBuilder()); + bits_reader_.reset(nullptr); + buf_.reset(nullptr); + filter_size_ = 0; + } + + void Add(const Slice& s) { + bits_builder_->AddKey(s); + } + + void Build() { + Slice filter = bits_builder_->Finish(&buf_); + bits_reader_.reset(policy_->GetFilterBitsReader(filter)); + filter_size_ = filter.size(); + } + + size_t FilterSize() const { + return filter_size_; + } + + bool Matches(const Slice& s) { + if (bits_reader_ == nullptr) { + Build(); + } + return bits_reader_->MayMatch(s); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST_F(FullBloomTest, FilterSize) { + uint32_t dont_care1, dont_care2; + auto full_bits_builder = GetFullFilterBitsBuilder(); + for (int n = 1; n < 100; n++) { + auto space = full_bits_builder->CalculateSpace(n, &dont_care1, &dont_care2); + auto n2 = full_bits_builder->CalculateNumEntry(space); + ASSERT_GE(n2, n); + auto space2 = + full_bits_builder->CalculateSpace(n2, &dont_care1, &dont_care2); + ASSERT_EQ(space, space2); + } +} + +TEST_F(FullBloomTest, FullEmptyFilter) { + // Empty filter is not match, at this level + ASSERT_TRUE(!Matches("hello")); + ASSERT_TRUE(!Matches("world")); +} + +TEST_F(FullBloomTest, FullSmall) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(!Matches("x")); + ASSERT_TRUE(!Matches("foo")); +} + +TEST_F(FullBloomTest, FullVaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 
10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast<int>(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + + return RUN_ALL_TESTS(); +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/build_version.cc.in b/src/rocksdb/util/build_version.cc.in new file mode 100644 index 00000000..d2e8c578 --- /dev/null +++ b/src/rocksdb/util/build_version.cc.in @@ -0,0 +1,4 @@ +#include "build_version.h" +const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:@@GIT_SHA@@"; +const char* rocksdb_build_git_date = "rocksdb_build_git_date:@@GIT_DATE_TIME@@"; +const char* rocksdb_build_compile_date = __DATE__; diff --git a/src/rocksdb/util/build_version.h b/src/rocksdb/util/build_version.h new file mode 100644 index 00000000..36ff92c0 --- /dev/null +++ b/src/rocksdb/util/build_version.h @@ -0,0 +1,15 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#if !defined(IOS_CROSS_COMPILE) +// if we compile with Xcode, we don't run build_detect_version, so we don't +// generate these variables +// this variable tells us about the git revision +extern const char* rocksdb_build_git_sha; + +// Date on which the code was compiled: +extern const char* rocksdb_build_compile_date; +#endif diff --git a/src/rocksdb/util/cast_util.h b/src/rocksdb/util/cast_util.h new file mode 100644 index 00000000..2dc8138a --- /dev/null +++ b/src/rocksdb/util/cast_util.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { +// A helper function to assert that replacing dynamic_cast<> with +// static_cast<> is correct. This function exists to deal with legacy code. +// It is not recommended to add new code that performs class casting. The +// preferred solution is to implement the functionality without the need +// for casting. 
+template <class DestClass, class SrcClass> +inline DestClass* static_cast_with_check(SrcClass* x) { + DestClass* ret = static_cast<DestClass*>(x); +#ifdef ROCKSDB_USE_RTTI + assert(ret == dynamic_cast<DestClass*>(x)); +#endif + return ret; +} +} // namespace rocksdb diff --git a/src/rocksdb/util/channel.h b/src/rocksdb/util/channel.h new file mode 100644 index 00000000..0225482c --- /dev/null +++ b/src/rocksdb/util/channel.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <condition_variable> +#include <mutex> +#include <queue> +#include <utility> + +namespace rocksdb { + +template <class T> +class channel { + public: + explicit channel() : eof_(false) {} + + channel(const channel&) = delete; + void operator=(const channel&) = delete; + + void sendEof() { + std::lock_guard<std::mutex> lk(lock_); + eof_ = true; + cv_.notify_all(); + } + + bool eof() { + std::lock_guard<std::mutex> lk(lock_); + return buffer_.empty() && eof_; + } + + size_t size() const { + std::lock_guard<std::mutex> lk(lock_); + return buffer_.size(); + } + + // writes elem to the queue + void write(T&& elem) { + std::unique_lock<std::mutex> lk(lock_); + buffer_.emplace(std::forward<T>(elem)); + cv_.notify_one(); + } + + /// Moves a dequeued element onto elem, blocking until an element + /// is available. + // returns false if EOF + bool read(T& elem) { + std::unique_lock<std::mutex> lk(lock_); + cv_.wait(lk, [&] { return eof_ || !buffer_.empty(); }); + if (eof_ && buffer_.empty()) { + return false; + } + elem = std::move(buffer_.front()); + buffer_.pop(); + cv_.notify_one(); + return true; + } + + private: + std::condition_variable cv_; + std::mutex lock_; + std::queue<T> buffer_; + bool eof_; +}; +} // namespace rocksdb diff --git a/src/rocksdb/util/coding.cc b/src/rocksdb/util/coding.cc new file mode 100644 index 00000000..b5cfac86 --- /dev/null +++ b/src/rocksdb/util/coding.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
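A hypothetical producer/consumer sketch for the channel<T> template above, not part of the imported sources; the thread bodies and message strings are ours.

#include <string>
#include <thread>

#include "util/channel.h"

int main() {
  rocksdb::channel<std::string> ch;
  std::thread producer([&ch] {
    for (int i = 0; i < 3; ++i) {
      ch.write("msg-" + std::to_string(i));  // wakes one blocked reader
    }
    ch.sendEof();  // once the queue drains, read() returns false
  });
  std::string msg;
  while (ch.read(msg)) {
    // read() blocks until an element or EOF is available
  }
  producer.join();
  return 0;
}

Note that eof() reports true only once the buffer is empty, so a reader never loses queued elements to a racing sendEof().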
+ +#include "util/coding.h" + +#include <algorithm> +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +// conversion' conversion from 'type1' to 'type2', possible loss of data +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4244) +#endif +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast<char*>(ptr); +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +const char* GetVarint32PtrFallback(const char* p, const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return nullptr; +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return nullptr; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/coding.h b/src/rocksdb/util/coding.h new file mode 100644 index 00000000..4046a2b6 --- /dev/null +++ b/src/rocksdb/util/coding.h @@ -0,0 +1,455 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#pragma once +#include <algorithm> +#include <stdint.h> +#include <string.h> +#include <string> + +#include "rocksdb/write_batch.h" +#include "port/port.h" + +// Some processors does not allow unaligned access to memory +#if defined(__sparc) + #define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED +#endif + +namespace rocksdb { + +// The maximum length of a varint in bytes for 64-bit. +const unsigned int kMaxVarint64Length = 10; + +// Standard Put... 
routines append to a string +extern void PutFixed16(std::string* dst, uint16_t value); +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint32Varint32(std::string* dst, uint32_t value1, + uint32_t value2); +extern void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1, + uint32_t value2, uint32_t value3); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutVarint64Varint64(std::string* dst, uint64_t value1, + uint64_t value2); +extern void PutVarint32Varint64(std::string* dst, uint32_t value1, + uint64_t value2); +extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1, + uint32_t value2, uint64_t value3); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); +extern void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetFixed64(Slice* input, uint64_t* value); +extern bool GetFixed32(Slice* input, uint32_t* value); +extern bool GetFixed16(Slice* input, uint16_t* value); +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +// This function assumes data is well-formed. +extern Slice GetLengthPrefixedSlice(const char* data); + +extern Slice GetSliceUntil(Slice* slice, char delimiter); + +// Borrowed from +// https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208 +constexpr inline uint64_t i64ToZigzag(const int64_t l) { + return (static_cast<uint64_t>(l) << 1) ^ static_cast<uint64_t>(l >> 63); +} +inline int64_t zigzagToI64(uint64_t n) { + return (n >> 1) ^ -static_cast<int64_t>(n & 1); +} + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// nullptr on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); +inline const char* GetVarsignedint64Ptr(const char* p, const char* limit, + int64_t* value) { + uint64_t u = 0; + const char* ret = GetVarint64Ptr(p, limit, &u); + *value = zigzagToI64(u); + return ret; +} + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed16(char* dst, uint16_t value); +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. 
+ +inline uint16_t DecodeFixed16(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint16_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast<uint16_t>(static_cast<unsigned char>(ptr[0]))) | + (static_cast<uint16_t>(static_cast<unsigned char>(ptr[1])) << 8)); + } +} + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast<const unsigned char*>(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +// -- Implementation of the functions declared above +inline void EncodeFixed16(char* buf, uint16_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + } +} + +inline void EncodeFixed32(char* buf, uint32_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + } +} + +inline void EncodeFixed64(char* buf, uint64_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; + } +} + +// Pull the last 8 bits and cast it to a character +inline void PutFixed16(std::string* dst, uint16_t value) { + if (port::kLittleEndian) { + dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed16(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + if (port::kLittleEndian) { + dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + if (port::kLittleEndian) { + dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void 
PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint32(std::string* dst, uint32_t v1, uint32_t v2) { + char buf[10]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1, + uint32_t v2, uint32_t v3) { + char buf[15]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + ptr = EncodeVarint32(ptr, v3); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast<unsigned char>(v); + return reinterpret_cast<char*>(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[kMaxVarint64Length]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarsignedint64(std::string* dst, int64_t v) { + char buf[kMaxVarint64Length]; + // Using Zigzag format to convert signed to unsigned + char* ptr = EncodeVarint64(buf, i64ToZigzag(v)); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) { + char buf[20]; + char* ptr = EncodeVarint64(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint64(std::string* dst, uint32_t v1, uint64_t v2) { + char buf[15]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint32Varint64(std::string* dst, uint32_t v1, + uint32_t v2, uint64_t v3) { + char buf[20]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + ptr = EncodeVarint64(ptr, v3); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, static_cast<uint32_t>(value.size())); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + size_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, static_cast<uint32_t>(total_bytes)); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetFixed64(Slice* input, uint64_t* value) { + if (input->size() < sizeof(uint64_t)) { + return false; + } + *value = DecodeFixed64(input->data()); + input->remove_prefix(sizeof(uint64_t)); + return true; +} + +inline bool GetFixed32(Slice* input, uint32_t* value) { + if (input->size() < sizeof(uint32_t)) { + return false; + } + *value = DecodeFixed32(input->data()); + input->remove_prefix(sizeof(uint32_t)); + return true; +} + +inline bool GetFixed16(Slice* input, uint16_t* value) { + if (input->size() < sizeof(uint16_t)) { + return false; + } + *value = DecodeFixed16(input->data()); + input->remove_prefix(sizeof(uint16_t)); + return true; +} + +inline bool GetVarint32(Slice* input, 
uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast<size_t>(limit - q)); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast<size_t>(limit - q)); + return true; + } +} + +// Provide an interface for platform independent endianness transformation +inline uint64_t EndianTransform(uint64_t input, size_t size) { + char* pos = reinterpret_cast<char*>(&input); + uint64_t ret_val = 0; + for (size_t i = 0; i < size; ++i) { + ret_val |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i])) + << ((size - i - 1) << 3)); + } + return ret_val; +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + // unsigned char is 7 bits, uint32_t is 32 bits, need 5 unsigned char + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} + +template<class T> +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void PutUnaligned(T *memory, const T &value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char *nonAlignedMemory = reinterpret_cast<char*>(memory); + memcpy(nonAlignedMemory, reinterpret_cast<const char*>(&value), sizeof(T)); +#else + *memory = value; +#endif +} + +template<class T> +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void GetUnaligned(const T *memory, T *value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char *nonAlignedMemory = reinterpret_cast<char*>(value); + memcpy(nonAlignedMemory, reinterpret_cast<const char*>(memory), sizeof(T)); +#else + *value = *memory; +#endif +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/coding_test.cc b/src/rocksdb/util/coding_test.cc new file mode 100644 index 00000000..f7b1671d --- /dev/null +++ b/src/rocksdb/util/coding_test.cc @@ -0,0 +1,217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
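Before the unit tests, a worked example of the varint32 format implemented in coding.h/coding.cc above: each byte carries seven payload bits and the high bit marks that more bytes follow, so 300 (binary 100101100) encodes as 0xAC, 0x02. A minimal round-trip sketch, using only the Put/Get helpers declared above:

#include <cassert>
#include <string>

#include "util/coding.h"

int main() {
  std::string buf;
  rocksdb::PutVarint32(&buf, 300);
  assert(buf.size() == 2);  // 300 needs two 7-bit groups
  assert(static_cast<unsigned char>(buf[0]) == 0xAC);  // low 7 bits, msb set
  assert(static_cast<unsigned char>(buf[1]) == 0x02);  // remaining bits

  rocksdb::Slice in(buf);
  uint32_t v = 0;
  assert(rocksdb::GetVarint32(&in, &v) && v == 300);
  assert(in.empty());  // the slice was advanced past the parsed value
  return 0;
}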
+ +#include "util/coding.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class Coding { }; +TEST(Coding, Fixed16) { + std::string s; + for (uint16_t v = 0; v < 0xFFFF; v++) { + PutFixed16(&s, v); + } + + const char* p = s.data(); + for (uint16_t v = 0; v < 0xFFFF; v++) { + uint16_t actual = DecodeFixed16(p); + ASSERT_EQ(v, actual); + p += sizeof(uint16_t); + } +} + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast<uint64_t>(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast<uint64_t>(1) << power; + uint64_t actual = 0; + actual = DecodeFixed64(p); + ASSERT_EQ(v-1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+1, actual); + p += sizeof(uint64_t); + } +} + +// Test that encoding routines generate little-endian encodings +TEST(Coding, EncodingOutput) { + std::string dst; + PutFixed32(&dst, 0x04030201); + ASSERT_EQ(4U, dst.size()); + ASSERT_EQ(0x01, static_cast<int>(dst[0])); + ASSERT_EQ(0x02, static_cast<int>(dst[1])); + ASSERT_EQ(0x03, static_cast<int>(dst[2])); + ASSERT_EQ(0x04, static_cast<int>(dst[3])); + + dst.clear(); + PutFixed64(&dst, 0x0807060504030201ull); + ASSERT_EQ(8U, dst.size()); + ASSERT_EQ(0x01, static_cast<int>(dst[0])); + ASSERT_EQ(0x02, static_cast<int>(dst[1])); + ASSERT_EQ(0x03, static_cast<int>(dst[2])); + ASSERT_EQ(0x04, static_cast<int>(dst[3])); + ASSERT_EQ(0x05, static_cast<int>(dst[4])); + ASSERT_EQ(0x06, static_cast<int>(dst[5])); + ASSERT_EQ(0x07, static_cast<int>(dst[6])); + ASSERT_EQ(0x08, static_cast<int>(dst[7])); +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t expected = (i / 32) << (i % 32); + uint32_t actual = 0; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector<uint64_t> values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast<uint64_t>(0)); + values.push_back(~static_cast<uint64_t>(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power-1); + values.push_back(power+1); + }; + + std::string s; + for (unsigned int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (unsigned int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual = 0; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); 
+ } + ASSERT_EQ(p, limit); + +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/compaction_job_stats_impl.cc b/src/rocksdb/util/compaction_job_stats_impl.cc new file mode 100644 index 00000000..a1ebc8b9 --- /dev/null +++ b/src/rocksdb/util/compaction_job_stats_impl.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/compaction_job_stats.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +void CompactionJobStats::Reset() { + elapsed_micros = 0; + cpu_micros = 0; + + num_input_records = 0; + num_input_files = 0; + num_input_files_at_output_level = 0; + + num_output_records = 0; + num_output_files = 0; + + is_manual_compaction = 0; + + total_input_bytes = 0; + total_output_bytes = 0; + + num_records_replaced = 0; + + total_input_raw_key_bytes = 0; + total_input_raw_value_bytes = 0; + + num_input_deletion_records = 0; + num_expired_deletion_records = 0; + + num_corrupt_keys = 0; + + file_write_nanos = 0; + file_range_sync_nanos = 0; + file_fsync_nanos = 0; + file_prepare_write_nanos = 0; + + num_single_del_fallthru = 0; + num_single_del_mismatch = 0; +} + +void CompactionJobStats::Add(const CompactionJobStats& stats) { + elapsed_micros += stats.elapsed_micros; + cpu_micros += stats.cpu_micros; + + num_input_records += stats.num_input_records; + num_input_files += stats.num_input_files; + num_input_files_at_output_level += stats.num_input_files_at_output_level; + + num_output_records += stats.num_output_records; + num_output_files += stats.num_output_files; + + total_input_bytes += stats.total_input_bytes; + total_output_bytes += stats.total_output_bytes; + + num_records_replaced += stats.num_records_replaced; + + total_input_raw_key_bytes += stats.total_input_raw_key_bytes; + total_input_raw_value_bytes += stats.total_input_raw_value_bytes; + + num_input_deletion_records += stats.num_input_deletion_records; + num_expired_deletion_records += stats.num_expired_deletion_records; + + num_corrupt_keys += stats.num_corrupt_keys; + + file_write_nanos += stats.file_write_nanos; + file_range_sync_nanos += stats.file_range_sync_nanos; + file_fsync_nanos += stats.file_fsync_nanos; + file_prepare_write_nanos += stats.file_prepare_write_nanos; + + num_single_del_fallthru += stats.num_single_del_fallthru; + num_single_del_mismatch += stats.num_single_del_mismatch; +} + +#else + +void CompactionJobStats::Reset() {} + +void CompactionJobStats::Add(const CompactionJobStats& /*stats*/) {} + +#endif // !ROCKSDB_LITE + +} // namespace rocksdb diff --git a/src/rocksdb/util/comparator.cc b/src/rocksdb/util/comparator.cc new file mode 100644 index 00000000..b42c2372 --- /dev/null +++ b/src/rocksdb/util/comparator.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include <algorithm> +#include <memory> +#include <stdint.h> +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "port/port.h" +#include "util/logging.h" + +namespace rocksdb { + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + const char* Name() const override { return "leveldb.BytewiseComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + return a.compare(b); + } + + bool Equal(const Slice& a, const Slice& b) const override { return a == b; } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]); + uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]); + if (start_byte >= limit_byte) { + // Cannot shorten since limit is smaller than start or start is + // already the shortest possible. + return; + } + assert(start_byte < limit_byte); + + if (diff_index < limit.size() - 1 || start_byte + 1 < limit_byte) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + } else { + // v + // A A 1 A A A + // A A 2 + // + // Incrementing the current byte will make start bigger than limit, we + // will skip this byte, and find the first non 0xFF byte in start and + // increment it. + diff_index++; + + while (diff_index < start->size()) { + // Keep moving until we find the first non 0xFF byte to + // increment it + if (static_cast<uint8_t>((*start)[diff_index]) < + static_cast<uint8_t>(0xff)) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + break; + } + diff_index++; + } + } + assert(Compare(*start, limit) < 0); + } + } + + void FindShortSuccessor(std::string* key) const override { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast<uint8_t>(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. 
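+    // (Illustrative, not in the original:) "abc" -> "b"; "\xff\xffq" ->
+    // "\xff\xffr"; a key consisting entirely of 0xff bytes is unchanged.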
+ } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + if (s.size() != t.size() || s.size() == 0) { + return false; + } + size_t diff_ind = s.difference_offset(t); + // same slice + if (diff_ind >= s.size()) return false; + uint8_t byte_s = static_cast<uint8_t>(s[diff_ind]); + uint8_t byte_t = static_cast<uint8_t>(t[diff_ind]); + // first different byte must be consecutive, and remaining bytes must be + // 0xff for s and 0x00 for t + if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) { + for (size_t i = diff_ind + 1; i < s.size(); ++i) { + byte_s = static_cast<uint8_t>(s[i]); + byte_t = static_cast<uint8_t>(t[i]); + if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) { + return false; + } + } + return true; + } else { + return false; + } + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } +}; + +class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { + public: + ReverseBytewiseComparatorImpl() { } + + const char* Name() const override { + return "rocksdb.ReverseBytewiseComparator"; + } + + int Compare(const Slice& a, const Slice& b) const override { + return -a.compare(b); + } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + assert(diff_index <= min_length); + if (diff_index == min_length) { + // Do not shorten if one string is a prefix of the other + // + // We could handle cases like: + // V + // A A 2 X Y + // A A 2 + // in a similar way as BytewiseComparator::FindShortestSeparator(). + // We keep it simple by not implementing it. We can come back to it + // later when needed. + } else { + uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]); + uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]); + if (start_byte > limit_byte && diff_index < start->size() - 1) { + // Case like + // V + // A A 3 A A + // A A 1 B B + // + // or + // v + // A A 2 A A + // A A 1 B B + // In this case "AA2" will be good. +#ifndef NDEBUG + std::string old_start = *start; +#endif + start->resize(diff_index + 1); +#ifndef NDEBUG + assert(old_start >= *start); +#endif + assert(Slice(*start).compare(limit) > 0); + } + } + } + + void FindShortSuccessor(std::string* /*key*/) const override { + // Don't do anything for simplicity. + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } +}; +}// namespace + +const Comparator* BytewiseComparator() { + static BytewiseComparatorImpl bytewise; + return &bytewise; +} + +const Comparator* ReverseBytewiseComparator() { + static ReverseBytewiseComparatorImpl rbytewise; + return &rbytewise; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/compression.h b/src/rocksdb/util/compression.h new file mode 100644 index 00000000..b901ceb3 --- /dev/null +++ b/src/rocksdb/util/compression.h @@ -0,0 +1,1347 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +// +#pragma once + +#include <algorithm> +#include <limits> +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else // OS_FREEBSD +#include <malloc.h> +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#include <string> + +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "util/coding.h" +#include "util/compression_context_cache.h" +#include "util/memory_allocator.h" +#include "util/string_util.h" + +#ifdef SNAPPY +#include <snappy.h> +#endif + +#ifdef ZLIB +#include <zlib.h> +#endif + +#ifdef BZIP2 +#include <bzlib.h> +#endif + +#if defined(LZ4) +#include <lz4.h> +#include <lz4hc.h> +#endif + +#if defined(ZSTD) +#include <zstd.h> +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ +#include <zdict.h> +#endif // ZSTD_VERSION_NUMBER >= 10103 +namespace rocksdb { +// Need this for the context allocation override +// On windows we need to do this explicitly +#if (ZSTD_VERSION_NUMBER >= 500) +#if defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && \ + defined(ZSTD_STATIC_LINKING_ONLY) +#define ROCKSDB_ZSTD_CUSTOM_MEM +namespace port { +ZSTD_customMem GetJeZstdAllocationOverrides(); +} // namespace port +#endif // defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && + // defined(ZSTD_STATIC_LINKING_ONLY) + +// We require `ZSTD_sizeof_DDict` and `ZSTD_createDDict_byReference` to use +// `ZSTD_DDict`. The former was introduced in v1.0.0 and the latter was +// introduced in v1.1.3. But an important bug fix for `ZSTD_sizeof_DDict` came +// in v1.1.4, so that is the version we require. As of today's latest version +// (v1.3.8), they are both still in the experimental API, which means they are +// only exported when the compiler flag `ZSTD_STATIC_LINKING_ONLY` is set. +#if defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 +#define ROCKSDB_ZSTD_DDICT +#endif // defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 + +// Cached data represents a portion that can be re-used +// If, in the future we have more than one native context to +// cache we can arrange this as a tuple +class ZSTDUncompressCachedData { + public: + using ZSTDNativeContext = ZSTD_DCtx*; + ZSTDUncompressCachedData() {} + // Init from cache + ZSTDUncompressCachedData(const ZSTDUncompressCachedData& o) = delete; + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&& o) ROCKSDB_NOEXCEPT + : ZSTDUncompressCachedData() { + *this = std::move(o); + } + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&& o) + ROCKSDB_NOEXCEPT { + assert(zstd_ctx_ == nullptr); + std::swap(zstd_ctx_, o.zstd_ctx_); + std::swap(cache_idx_, o.cache_idx_); + return *this; + } + ZSTDNativeContext Get() const { return zstd_ctx_; } + int64_t GetCacheIndex() const { return cache_idx_; } + void CreateIfNeeded() { + if (zstd_ctx_ == nullptr) { +#ifdef ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = + ZSTD_createDCtx_advanced(port::GetJeZstdAllocationOverrides()); +#else // ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = ZSTD_createDCtx(); +#endif // ROCKSDB_ZSTD_CUSTOM_MEM + cache_idx_ = -1; + } + } + void InitFromCache(const ZSTDUncompressCachedData& o, int64_t idx) { + zstd_ctx_ = o.zstd_ctx_; + cache_idx_ = idx; + } + ~ZSTDUncompressCachedData() { + if (zstd_ctx_ != nullptr && cache_idx_ == -1) { + ZSTD_freeDCtx(zstd_ctx_); + } + } + + private: + ZSTDNativeContext zstd_ctx_ = nullptr; + int64_t cache_idx_ = -1; // -1 means this instance owns the context +}; +#endif // 
(ZSTD_VERSION_NUMBER >= 500) +} // namespace rocksdb +#endif // ZSTD + +#if !(defined ZSTD) || !(ZSTD_VERSION_NUMBER >= 500) +namespace rocksdb { +class ZSTDUncompressCachedData { + void* padding; // unused + public: + using ZSTDNativeContext = void*; + ZSTDUncompressCachedData() {} + ZSTDUncompressCachedData(const ZSTDUncompressCachedData&) {} + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&&) + ROCKSDB_NOEXCEPT = default; + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&&) + ROCKSDB_NOEXCEPT = default; + ZSTDNativeContext Get() const { return nullptr; } + int64_t GetCacheIndex() const { return -1; } + void CreateIfNeeded() {} + void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {} + private: + void ignore_padding__() { padding = nullptr; } +}; +} // namespace rocksdb +#endif + +#if defined(XPRESS) +#include "port/xpress.h" +#endif + +namespace rocksdb { + +// Holds dictionary and related data, like ZSTD's digested compression +// dictionary. +struct CompressionDict { +#if ZSTD_VERSION_NUMBER >= 700 + ZSTD_CDict* zstd_cdict_ = nullptr; +#endif // ZSTD_VERSION_NUMBER >= 700 + std::string dict_; + + public: +#if ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType type, int level) { +#else // ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType /*type*/, int /*level*/) { +#endif // ZSTD_VERSION_NUMBER >= 700 + dict_ = std::move(dict); +#if ZSTD_VERSION_NUMBER >= 700 + zstd_cdict_ = nullptr; + if (!dict_.empty() && (type == kZSTD || type == kZSTDNotFinalCompression)) { + if (level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } + // Should be safe (but slower) if below call fails as we'll use the + // raw dictionary to compress. + zstd_cdict_ = ZSTD_createCDict(dict_.data(), dict_.size(), level); + assert(zstd_cdict_ != nullptr); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + } + + ~CompressionDict() { +#if ZSTD_VERSION_NUMBER >= 700 + size_t res = 0; + if (zstd_cdict_ != nullptr) { + res = ZSTD_freeCDict(zstd_cdict_); + } + assert(res == 0); // Last I checked they can't fail + (void)res; // prevent unused var warning +#endif // ZSTD_VERSION_NUMBER >= 700 + } + +#if ZSTD_VERSION_NUMBER >= 700 + const ZSTD_CDict* GetDigestedZstdCDict() const { return zstd_cdict_; } +#endif // ZSTD_VERSION_NUMBER >= 700 + + Slice GetRawDict() const { return dict_; } + + static const CompressionDict& GetEmptyDict() { + static CompressionDict empty_dict{}; + return empty_dict; + } + + CompressionDict() = default; + // Disable copy/move + CompressionDict(const CompressionDict&) = delete; + CompressionDict& operator=(const CompressionDict&) = delete; + CompressionDict(CompressionDict&&) = delete; + CompressionDict& operator=(CompressionDict&&) = delete; +}; + +// Holds dictionary and related data, like ZSTD's digested uncompression +// dictionary. +struct UncompressionDict { +#ifdef ROCKSDB_ZSTD_DDICT + ZSTD_DDict* zstd_ddict_; +#endif // ROCKSDB_ZSTD_DDICT + // Block containing the data for the compression dictionary. It may be + // redundant with the data held in `zstd_ddict_`. + std::string dict_; + // This `Statistics` pointer is intended to be used upon block cache eviction, + // so only needs to be populated on `UncompressionDict`s that'll be inserted + // into block cache. 
+  Statistics* statistics_;
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool using_zstd,
+                    Statistics* _statistics = nullptr) {
+#else   // ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool /*using_zstd*/,
+                    Statistics* _statistics = nullptr) {
+#endif  // ROCKSDB_ZSTD_DDICT
+    dict_ = std::move(dict);
+    statistics_ = _statistics;
+#ifdef ROCKSDB_ZSTD_DDICT
+    zstd_ddict_ = nullptr;
+    if (!dict_.empty() && using_zstd) {
+      zstd_ddict_ = ZSTD_createDDict_byReference(dict_.data(), dict_.size());
+      assert(zstd_ddict_ != nullptr);
+    }
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+  ~UncompressionDict() {
+#ifdef ROCKSDB_ZSTD_DDICT
+    size_t res = 0;
+    if (zstd_ddict_ != nullptr) {
+      res = ZSTD_freeDDict(zstd_ddict_);
+    }
+    assert(res == 0);  // Last I checked they can't fail
+    (void)res;         // prevent unused var warning
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; }
+#endif  // ROCKSDB_ZSTD_DDICT
+
+  Slice GetRawDict() const { return dict_; }
+
+  static const UncompressionDict& GetEmptyDict() {
+    static UncompressionDict empty_dict{};
+    return empty_dict;
+  }
+
+  Statistics* statistics() const { return statistics_; }
+
+  size_t ApproximateMemoryUsage() {
+    size_t usage = 0;
+    usage += sizeof(struct UncompressionDict);
+#ifdef ROCKSDB_ZSTD_DDICT
+    usage += ZSTD_sizeof_DDict(zstd_ddict_);
+#endif  // ROCKSDB_ZSTD_DDICT
+    usage += dict_.size();
+    return usage;
+  }
+
+  UncompressionDict() = default;
+  // Disable copy/move (the deleted signatures must name UncompressionDict,
+  // not CompressionDict, or the implicit copy operations are still generated)
+  UncompressionDict(const UncompressionDict&) = delete;
+  UncompressionDict& operator=(const UncompressionDict&) = delete;
+  UncompressionDict(UncompressionDict&&) = delete;
+  UncompressionDict& operator=(UncompressionDict&&) = delete;
+};
+
+class CompressionContext {
+ private:
+#if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500)
+  ZSTD_CCtx* zstd_ctx_ = nullptr;
+  void CreateNativeContext(CompressionType type) {
+    if (type == kZSTD || type == kZSTDNotFinalCompression) {
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ =
+          ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else   // ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ = ZSTD_createCCtx();
+#endif  // ROCKSDB_ZSTD_CUSTOM_MEM
+    }
+  }
+  void DestroyNativeContext() {
+    if (zstd_ctx_ != nullptr) {
+      ZSTD_freeCCtx(zstd_ctx_);
+    }
+  }
+
+ public:
+  // callable inside ZSTD_Compress
+  ZSTD_CCtx* ZSTDPreallocCtx() const {
+    assert(zstd_ctx_ != nullptr);
+    return zstd_ctx_;
+  }
+
+#else   // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ private:
+  void CreateNativeContext(CompressionType /* type */) {}
+  void DestroyNativeContext() {}
+#endif  // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ public:
+  explicit CompressionContext(CompressionType type) {
+    CreateNativeContext(type);
+  }
+  ~CompressionContext() { DestroyNativeContext(); }
+  CompressionContext(const CompressionContext&) = delete;
+  CompressionContext& operator=(const CompressionContext&) = delete;
+};
+
+class CompressionInfo {
+  const CompressionOptions& opts_;
+  const CompressionContext& context_;
+  const CompressionDict& dict_;
+  const CompressionType type_;
+  const uint64_t sample_for_compression_;
+
+ public:
+  CompressionInfo(const CompressionOptions& _opts,
+                  const CompressionContext& _context,
+                  const CompressionDict& _dict, CompressionType _type,
+                  uint64_t _sample_for_compression)
+      : opts_(_opts),
+        context_(_context),
+        dict_(_dict),
+        type_(_type),
+        sample_for_compression_(_sample_for_compression) {}
+
+  const CompressionOptions& options() const { return opts_; }
+  const
CompressionContext& context() const { return context_; } + const CompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } + uint64_t SampleForCompression() const { return sample_for_compression_; } +}; + +class UncompressionContext { + private: + CompressionContextCache* ctx_cache_ = nullptr; + ZSTDUncompressCachedData uncomp_cached_data_; + + public: + struct NoCache {}; + // Do not use context cache, used by TableBuilder + UncompressionContext(NoCache, CompressionType /* type */) {} + + explicit UncompressionContext(CompressionType type) { + if (type == kZSTD || type == kZSTDNotFinalCompression) { + ctx_cache_ = CompressionContextCache::Instance(); + uncomp_cached_data_ = ctx_cache_->GetCachedZSTDUncompressData(); + } + } + ~UncompressionContext() { + if (uncomp_cached_data_.GetCacheIndex() != -1) { + assert(ctx_cache_ != nullptr); + ctx_cache_->ReturnCachedZSTDUncompressData( + uncomp_cached_data_.GetCacheIndex()); + } + } + UncompressionContext(const UncompressionContext&) = delete; + UncompressionContext& operator=(const UncompressionContext&) = delete; + + ZSTDUncompressCachedData::ZSTDNativeContext GetZSTDContext() const { + return uncomp_cached_data_.Get(); + } +}; + +class UncompressionInfo { + const UncompressionContext& context_; + const UncompressionDict& dict_; + const CompressionType type_; + + public: + UncompressionInfo(const UncompressionContext& _context, + const UncompressionDict& _dict, CompressionType _type) + : context_(_context), dict_(_dict), type_(_type) {} + + const UncompressionContext& context() const { return context_; } + const UncompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } +}; + +inline bool Snappy_Supported() { +#ifdef SNAPPY + return true; +#else + return false; +#endif +} + +inline bool Zlib_Supported() { +#ifdef ZLIB + return true; +#else + return false; +#endif +} + +inline bool BZip2_Supported() { +#ifdef BZIP2 + return true; +#else + return false; +#endif +} + +inline bool LZ4_Supported() { +#ifdef LZ4 + return true; +#else + return false; +#endif +} + +inline bool XPRESS_Supported() { +#ifdef XPRESS + return true; +#else + return false; +#endif +} + +inline bool ZSTD_Supported() { +#ifdef ZSTD + // ZSTD format is finalized since version 0.8.0. 
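+  // (Clarifying note, not in the original:) ZSTD_versionNumber() returns
+  // MAJOR*10000 + MINOR*100 + RELEASE, so 800 corresponds to v0.8.0.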
+ return (ZSTD_versionNumber() >= 800); +#else + return false; +#endif +} + +inline bool ZSTDNotFinal_Supported() { +#ifdef ZSTD + return true; +#else + return false; +#endif +} + +inline bool CompressionTypeSupported(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return true; + case kSnappyCompression: + return Snappy_Supported(); + case kZlibCompression: + return Zlib_Supported(); + case kBZip2Compression: + return BZip2_Supported(); + case kLZ4Compression: + return LZ4_Supported(); + case kLZ4HCCompression: + return LZ4_Supported(); + case kXpressCompression: + return XPRESS_Supported(); + case kZSTDNotFinalCompression: + return ZSTDNotFinal_Supported(); + case kZSTD: + return ZSTD_Supported(); + default: + assert(false); + return false; + } +} + +inline std::string CompressionTypeToString(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return "NoCompression"; + case kSnappyCompression: + return "Snappy"; + case kZlibCompression: + return "Zlib"; + case kBZip2Compression: + return "BZip2"; + case kLZ4Compression: + return "LZ4"; + case kLZ4HCCompression: + return "LZ4HC"; + case kXpressCompression: + return "Xpress"; + case kZSTD: + return "ZSTD"; + case kZSTDNotFinalCompression: + return "ZSTDNotFinal"; + default: + assert(false); + return ""; + } +} + +inline std::string CompressionOptionsToString( + CompressionOptions& compression_options) { + std::string result; + result.reserve(512); + result.append("window_bits=") + .append(ToString(compression_options.window_bits)) + .append("; "); + result.append("level=") + .append(ToString(compression_options.level)) + .append("; "); + result.append("strategy=") + .append(ToString(compression_options.strategy)) + .append("; "); + result.append("max_dict_bytes=") + .append(ToString(compression_options.max_dict_bytes)) + .append("; "); + result.append("zstd_max_train_bytes=") + .append(ToString(compression_options.zstd_max_train_bytes)) + .append("; "); + result.append("enabled=") + .append(ToString(compression_options.enabled)) + .append("; "); + return result; +} + +// compress_format_version can have two values: +// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed +// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent +// way. +// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the +// start of compressed block. Snappy format is the same as version 1. 
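To make the version-2 framing concrete, here is a minimal sketch (not part of the patch; ReadV2SizeHeader is a hypothetical name) of peeling the varint32 size header off a block with the coding helpers exercised in coding_test.cc above. It mirrors what compression::GetDecompressedSizeInfo(), defined just below, does:

#include <cstdint>
#include <string>
#include "util/coding.h"

// On a format-version-2 block the payload is preceded by the decompressed
// size as a varint32; version-1 blocks (and Snappy) have no such header.
bool ReadV2SizeHeader(const char** data, size_t* len,
                      uint32_t* decompressed_size) {
  const char* p =
      rocksdb::GetVarint32Ptr(*data, *data + *len, decompressed_size);
  if (p == nullptr) {
    return false;  // truncated or malformed varint
  }
  *len -= static_cast<size_t>(p - *data);
  *data = p;  // now points at the compressed bytes
  return true;
}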
+ +inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#else + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { +#ifdef SNAPPY + return snappy::GetUncompressedLength(input, length, result); +#else + (void)input; + (void)length; + (void)result; + return false; +#endif +} + +inline bool Snappy_Uncompress(const char* input, size_t length, char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +namespace compression { +// returns size +inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) { + PutVarint32(output, length); + return output->size(); +} + +inline bool GetDecompressedSizeInfo(const char** input_data, + size_t* input_length, + uint32_t* output_len) { + auto new_input_data = + GetVarint32Ptr(*input_data, *input_data + *input_length, output_len); + if (new_input_data == nullptr) { + return false; + } + *input_length -= (new_input_data - *input_data); + *input_data = new_input_data; + return true; +} +} // namespace compression + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline bool Zlib_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + if (length > std::numeric_limits<uint32_t>::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast<uint32_t>(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + level = Z_DEFAULT_COMPRESSION; + } else { + level = info.options().level; + } + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits, + memLevel, info.options().strategy); + if (st != Z_OK) { + return false; + } + + Slice compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + // Initialize the compression library's dictionary + st = deflateSetDictionary( + &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()), + static_cast<unsigned int>(compression_dict.size())); + if (st != Z_OK) { + deflateEnd(&_stream); + return false; + } + } + + // Compress the input, and put compressed data in output. 
+  _stream.next_in = (Bytef*)input;
+  _stream.avail_in = static_cast<unsigned int>(length);
+
+  // Initialize the output size.
+  _stream.avail_out = static_cast<unsigned int>(length);
+  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);
+
+  bool compressed = false;
+  st = deflate(&_stream, Z_FINISH);
+  if (st == Z_STREAM_END) {
+    compressed = true;
+    output->resize(output->size() - _stream.avail_out);
+  }
+  // The only return value we really care about is Z_STREAM_END.
+  // Z_OK means insufficient output space, i.e. the compressed data is bigger
+  // than the uncompressed input. Just fail the compression in that case.
+
+  deflateEnd(&_stream);
+  return compressed;
+#else
+  (void)info;
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline CacheAllocationPtr Zlib_Uncompress(
+    const UncompressionInfo& info, const char* input_data, size_t input_length,
+    int* decompress_size, uint32_t compress_format_version,
+    MemoryAllocator* allocator = nullptr, int windowBits = -14) {
+#ifdef ZLIB
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data will be 5x the compressed size; round up
+    // to the next page boundary.
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st =
+      inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits);
+  if (st != Z_OK) {
+    return nullptr;
+  }
+
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    // Initialize the compression library's dictionary
+    st = inflateSetDictionary(
+        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
+        static_cast<unsigned int>(compression_dict.size()));
+    if (st != Z_OK) {
+      return nullptr;
+    }
+  }
+
+  _stream.next_in = (Bytef*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (Bytef*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = inflate(&_stream, Z_SYNC_FLUSH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        size_t old_sz = output_len;
+        uint32_t output_len_delta = output_len / 5;
+        output_len += output_len_delta < 10 ? 10 : output_len_delta;
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Set more output.
+ _stream.next_out = (Bytef*)(output.get() + old_sz); + _stream.avail_out = static_cast<unsigned int>(output_len - old_sz); + break; + } + case Z_BUF_ERROR: + default: + inflateEnd(&_stream); + return nullptr; + } + } + + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); + *decompress_size = static_cast<int>(output_len - _stream.avail_out); + inflateEnd(&_stream); + return output; +#else + (void)info; + (void)input_data; + (void)input_length; + (void)decompress_size; + (void)compress_format_version; + (void)allocator; + (void)windowBits; + return nullptr; +#endif +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool BZip2_Compress(const CompressionInfo& /*info*/, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + if (length > std::numeric_limits<uint32_t>::max()) { + // Can't compress more than 4GB + return false; + } + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast<uint32_t>(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Compress the input, and put compressed data in output. + _stream.next_in = (char*)input; + _stream.avail_in = static_cast<unsigned int>(length); + + // Initialize the output size. + _stream.avail_out = static_cast<unsigned int>(length); + _stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]); + + bool compressed = false; + st = BZ2_bzCompress(&_stream, BZ_FINISH); + if (st == BZ_STREAM_END) { + compressed = true; + output->resize(output->size() - _stream.avail_out); + } + // The only return value we really care about is BZ_STREAM_END. + // BZ_FINISH_OK means insufficient output space. This means the compression + // is bigger than decompressed size. Just fail the compression in that case. 
+
+  BZ2_bzCompressEnd(&_stream);
+  return compressed;
+#else
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline CacheAllocationPtr BZip2_Uncompress(
+    const char* input_data, size_t input_length, int* decompress_size,
+    uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) {
+#ifdef BZIP2
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data will be 5x the compressed size; round up
+    // to the next page boundary.
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (char*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        done = true;
+        break;
+      case BZ_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        uint32_t old_sz = output_len;
+        output_len = static_cast<uint32_t>(output_len * 1.2);
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Set more output.
+        _stream.next_out = (char*)(output.get() + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      default:
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  *decompress_size = static_cast<int>(output_len - _stream.avail_out);
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#else
+  (void)input_data;
+  (void)input_length;
+  (void)decompress_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline bool LZ4_Compress(const CompressionInfo& info,
+                         uint32_t compress_format_version, const char* input,
+                         size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+  int compress_bound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compress_bound));
+
+  int outlen;
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_stream_t* stream = LZ4_createStream();
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    LZ4_loadDict(stream, compression_dict.data(),
+                 static_cast<int>(compression_dict.size()));
+  }
+#if LZ4_VERSION_NUMBER >= 10700  // r129+
+  outlen =
+      LZ4_compress_fast_continue(stream, input, &(*output)[output_header_len],
+                                 static_cast<int>(length), compress_bound, 1);
+#else  // up to r128
+  outlen = LZ4_compress_limitedOutput_continue(
+      stream, input, &(*output)[output_header_len], static_cast<int>(length),
+      compress_bound);
+#endif
+  LZ4_freeStream(stream);
+#else  // up to r123
+  outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
+                                      static_cast<int>(length), compress_bound);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(static_cast<size_t>(output_header_len + outlen));
+  return true;
+#else  // LZ4
+  (void)info;
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info,
+                                         const char* input_data,
+                                         size_t input_length,
+                                         int* decompress_size,
+                                         uint32_t compress_format_version,
+                                         MemoryAllocator* allocator = nullptr) {
+#ifdef LZ4
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    if (input_length < 8) {
+      return nullptr;
+    }
+    memcpy(&output_len, input_data, sizeof(output_len));
+    input_length -= 8;
+    input_data += 8;
+  }
+
+  auto output = AllocateBlock(output_len, allocator);
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    LZ4_setStreamDecode(stream, compression_dict.data(),
+                        static_cast<int>(compression_dict.size()));
+  }
+  *decompress_size = LZ4_decompress_safe_continue(
+      stream, input_data, output.get(), static_cast<int>(input_length),
+      static_cast<int>(output_len));
+  LZ4_freeStreamDecode(stream);
+#else  // up to r123
+  *decompress_size = LZ4_decompress_safe(input_data, output.get(),
+                                         static_cast<int>(input_length),
+                                         static_cast<int>(output_len));
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (*decompress_size < 0) {
+    return nullptr;
+  }
+  assert(*decompress_size == static_cast<int>(output_len));
+  return output;
+#else  // LZ4
+  (void)info;
+  (void)input_data;
+  (void)input_length;
+  (void)decompress_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline bool LZ4HC_Compress(const CompressionInfo& info,
+                           uint32_t compress_format_version, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+  int compress_bound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compress_bound));
+
+  int outlen;
+  int level;
+  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
+    level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
+  } else {
+    level = info.options().level;
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamHC_t* stream = LZ4_createStreamHC();
+  LZ4_resetStreamHC(stream, level);
+  Slice compression_dict = info.dict().GetRawDict();
+  const char* compression_dict_data =
+      compression_dict.size() > 0 ?
compression_dict.data() : nullptr; + size_t compression_dict_size = compression_dict.size(); + LZ4_loadDictHC(stream, compression_dict_data, + static_cast<int>(compression_dict_size)); + +#if LZ4_VERSION_NUMBER >= 10700 // r129+ + outlen = + LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len], + static_cast<int>(length), compress_bound); +#else // r124-r128 + outlen = LZ4_compressHC_limitedOutput_continue( + stream, input, &(*output)[output_header_len], static_cast<int>(length), + compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10700 + LZ4_freeStreamHC(stream); + +#elif LZ4_VERSION_MAJOR // r113-r123 + outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len], + static_cast<int>(length), + compress_bound, level); +#else // up to r112 + outlen = + LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len], + static_cast<int>(length), compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (outlen == 0) { + return false; + } + output->resize(static_cast<size_t>(output_header_len + outlen)); + return true; +#else // LZ4 + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +#ifdef XPRESS +inline bool XPRESS_Compress(const char* input, size_t length, + std::string* output) { + return port::xpress::Compress(input, length, output); +} +#else +inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/, + std::string* /*output*/) { + return false; +} +#endif + +#ifdef XPRESS +inline char* XPRESS_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { + return port::xpress::Decompress(input_data, input_length, decompress_size); +} +#else +inline char* XPRESS_Uncompress(const char* /*input_data*/, + size_t /*input_length*/, + int* /*decompress_size*/) { + return nullptr; +} +#endif + +inline bool ZSTD_Compress(const CompressionInfo& info, const char* input, + size_t length, ::std::string* output) { +#ifdef ZSTD + if (length > std::numeric_limits<uint32_t>::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast<uint32_t>(length)); + + size_t compressBound = ZSTD_compressBound(length); + output->resize(static_cast<size_t>(output_header_len + compressBound)); + size_t outlen = 0; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } else { + level = info.options().level; + } +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_CCtx* context = info.context().ZSTDPreallocCtx(); + assert(context != nullptr); +#if ZSTD_VERSION_NUMBER >= 700 // v0.7.0+ + if (info.dict().GetDigestedZstdCDict() != nullptr) { + outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetDigestedZstdCDict()); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + if (outlen == 0) { + outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetRawDict().data(), + info.dict().GetRawDict().size(), level); + } +#else // up to v0.4.x + outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input, + length, level); +#endif // ZSTD_VERSION_NUMBER >= 500 + if (outlen == 0) { + return false; + } + output->resize(output_header_len + outlen); + return true; +#else // ZSTD + (void)info; + 
(void)input; + (void)length; + (void)output; + return false; +#endif +} + +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline CacheAllocationPtr ZSTD_Uncompress( + const UncompressionInfo& info, const char* input_data, size_t input_length, + int* decompress_size, MemoryAllocator* allocator = nullptr) { +#ifdef ZSTD + uint32_t output_len = 0; + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + + auto output = AllocateBlock(output_len, allocator); + size_t actual_output_length = 0; +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_DCtx* context = info.context().GetZSTDContext(); + assert(context != nullptr); +#ifdef ROCKSDB_ZSTD_DDICT + if (info.dict().GetDigestedZstdDDict() != nullptr) { + actual_output_length = ZSTD_decompress_usingDDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetDigestedZstdDDict()); + } +#endif // ROCKSDB_ZSTD_DDICT + if (actual_output_length == 0) { + actual_output_length = ZSTD_decompress_usingDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetRawDict().data(), info.dict().GetRawDict().size()); + } +#else // up to v0.4.x + (void)info; + actual_output_length = + ZSTD_decompress(output.get(), output_len, input_data, input_length); +#endif // ZSTD_VERSION_NUMBER >= 500 + assert(actual_output_length == output_len); + *decompress_size = static_cast<int>(actual_output_length); + return output; +#else // ZSTD + (void)info; + (void)input_data; + (void)input_length; + (void)decompress_size; + (void)allocator; + return nullptr; +#endif +} + +inline bool ZSTD_TrainDictionarySupported() { +#ifdef ZSTD + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. + return (ZSTD_versionNumber() >= 10103); +#else + return false; +#endif +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + const std::vector<size_t>& sample_lens, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ + assert(samples.empty() == sample_lens.empty()); + if (samples.empty()) { + return ""; + } + std::string dict_data(max_dict_bytes, '\0'); + size_t dict_len = ZDICT_trainFromBuffer( + &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0], + static_cast<unsigned>(sample_lens.size())); + if (ZDICT_isError(dict_len)) { + return ""; + } + assert(dict_len <= max_dict_bytes); + dict_data.resize(dict_len); + return dict_data; +#else // up to v1.1.2 + assert(false); + (void)samples; + (void)sample_lens; + (void)max_dict_bytes; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10103 +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + size_t sample_len_shift, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1, but ZSTD was marked stable + // only since v0.8.0. For now we enable the feature in stable versions only. 
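+  // (Illustrative, not in the original:) sample_len_shift == 10 means 1 KiB
+  // samples, so a 1 MiB "samples" buffer yields 1024 equal-length samples.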
+#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
+  // skips potential partial sample at the end of "samples"
+  size_t num_samples = samples.size() >> sample_len_shift;
+  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
+  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
+#else   // up to v1.1.2
+  assert(false);
+  (void)samples;
+  (void)sample_len_shift;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD_VERSION_NUMBER >= 10103
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/compression_context_cache.cc b/src/rocksdb/util/compression_context_cache.cc
new file mode 100644
index 00000000..6fb5c4fc
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#include "util/compression_context_cache.h"
+
+#include "util/compression.h"
+#include "util/core_local.h"
+
+#include <atomic>
+
+namespace rocksdb {
+namespace compression_cache {
+
+void* const SentinelValue = nullptr;
+// Cache ZSTD uncompression contexts for reads.
+// If needed, we can add ZSTD compression context caching, which is currently
+// not done since BlockBasedTableBuilder simply creates one compression
+// context per new SST file.
+struct ZSTDCachedData {
+  // We choose to cache the structure below instead of a pointer because we
+  // want to a) avoid leaking native types and b) make cache use transparent
+  // to the user
+  ZSTDUncompressCachedData uncomp_cached_data_;
+  std::atomic<void*> zstd_uncomp_sentinel_;
+
+  char
+      padding[(CACHE_LINE_SIZE -
+               (sizeof(ZSTDUncompressCachedData) + sizeof(std::atomic<void*>)) %
+                   CACHE_LINE_SIZE)];  // unused padding field
+
+  ZSTDCachedData() : zstd_uncomp_sentinel_(&uncomp_cached_data_) {}
+  ZSTDCachedData(const ZSTDCachedData&) = delete;
+  ZSTDCachedData& operator=(const ZSTDCachedData&) = delete;
+
+  ZSTDUncompressCachedData GetUncompressData(int64_t idx) {
+    ZSTDUncompressCachedData result;
+    void* expected = &uncomp_cached_data_;
+    if (zstd_uncomp_sentinel_.compare_exchange_strong(expected,
+                                                      SentinelValue)) {
+      uncomp_cached_data_.CreateIfNeeded();
+      result.InitFromCache(uncomp_cached_data_, idx);
+    } else {
+      // Creates one-time-use data
+      result.CreateIfNeeded();
+    }
+    return result;
+  }
+  // Return the entry back into circulation.
+  // This is executed only when we successfully obtained the entry
+  // in the first place.
+  void ReturnUncompressData() {
+    if (zstd_uncomp_sentinel_.exchange(&uncomp_cached_data_) != SentinelValue) {
+      // Means we are returning while not having it acquired.
+      assert(false);
+    }
+  }
+};
+static_assert(sizeof(ZSTDCachedData) % CACHE_LINE_SIZE == 0,
+              "Expected CACHE_LINE_SIZE alignment");
+}  // namespace compression_cache
+
+using namespace compression_cache;
+
+class CompressionContextCache::Rep {
+ public:
+  Rep() {}
+  ZSTDUncompressCachedData GetZSTDUncompressData() {
+    auto p = per_core_uncompr_.AccessElementAndIndex();
+    int64_t idx = static_cast<int64_t>(p.second);
+    return p.first->GetUncompressData(idx);
+  }
+  void ReturnZSTDUncompressData(int64_t idx) {
+    assert(idx >= 0);
+    auto* cn = per_core_uncompr_.AccessAtCore(static_cast<size_t>(idx));
+    cn->ReturnUncompressData();
+  }
+
+ private:
+  CoreLocalArray<ZSTDCachedData> per_core_uncompr_;
+};
+
+CompressionContextCache::CompressionContextCache() : rep_(new Rep()) {}
+
+CompressionContextCache* CompressionContextCache::Instance() {
+  static CompressionContextCache instance;
+  return &instance;
+}
+
+void CompressionContextCache::InitSingleton() { Instance(); }
+
+ZSTDUncompressCachedData
+CompressionContextCache::GetCachedZSTDUncompressData() {
+  return rep_->GetZSTDUncompressData();
+}
+
+void CompressionContextCache::ReturnCachedZSTDUncompressData(int64_t idx) {
+  rep_->ReturnZSTDUncompressData(idx);
+}
+
+CompressionContextCache::~CompressionContextCache() { delete rep_; }
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/compression_context_cache.h b/src/rocksdb/util/compression_context_cache.h
new file mode 100644
index 00000000..4ea6ae35
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+// The compression context cache allows caching compression/uncompression
+// contexts. This helps with random-read latencies and reduces CPU utilization.
+// Caching is implemented using the CoreLocal facility: compression/
+// uncompression instances are cached on a per-core basis using CoreLocalArray.
+// A borrowed instance is atomically replaced with a sentinel value for the
+// duration of its use. If another thread is already making use of the cached
+// instance, we create a one-off instance on the heap, which is destroyed
+// after use.
+
+#pragma once
+
+#include <stdint.h>
+
+namespace rocksdb {
+class ZSTDUncompressCachedData;
+
+class CompressionContextCache {
+ public:
+  // Singleton
+  static CompressionContextCache* Instance();
+  static void InitSingleton();
+  CompressionContextCache(const CompressionContextCache&) = delete;
+  CompressionContextCache& operator=(const CompressionContextCache&) = delete;
+
+  ZSTDUncompressCachedData GetCachedZSTDUncompressData();
+  void ReturnCachedZSTDUncompressData(int64_t idx);
+
+ private:
+  // Singleton
+  CompressionContextCache();
+  ~CompressionContextCache();
+
+  class Rep;
+  Rep* rep_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/concurrent_arena.cc b/src/rocksdb/util/concurrent_arena.cc
new file mode 100644
index 00000000..cef77d7e
--- /dev/null
+++ b/src/rocksdb/util/concurrent_arena.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
diff --git a/src/rocksdb/util/concurrent_arena.cc b/src/rocksdb/util/concurrent_arena.cc
new file mode 100644
index 00000000..cef77d7e
--- /dev/null
+++ b/src/rocksdb/util/concurrent_arena.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/concurrent_arena.h"
+#include <thread>
+#include "port/port.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+__thread size_t ConcurrentArena::tls_cpuid = 0;
+#endif
+
+namespace {
+// If the shard block size is too large, in the worst case, every core
+// allocates a block without populating it. If the shard block size is
+// 1MB, 64 cores will quickly allocate 64MB, and may quickly trigger a
+// flush. Cap the size instead.
+const size_t kMaxShardBlockSize = size_t{128 * 1024};
+}  // namespace
+
+ConcurrentArena::ConcurrentArena(size_t block_size, AllocTracker* tracker,
+                                 size_t huge_page_size)
+    : shard_block_size_(std::min(kMaxShardBlockSize, block_size / 8)),
+      shards_(),
+      arena_(block_size, tracker, huge_page_size) {
+  Fixup();
+}
+
+ConcurrentArena::Shard* ConcurrentArena::Repick() {
+  auto shard_and_index = shards_.AccessElementAndIndex();
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+  // even if we are on cpu 0, use a non-zero tls_cpuid so we can tell that
+  // we have repicked
+  tls_cpuid = shard_and_index.second | shards_.Size();
+#endif
+  return shard_and_index.first;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/concurrent_arena.h b/src/rocksdb/util/concurrent_arena.h
new file mode 100644
index 00000000..a6191100
--- /dev/null
+++ b/src/rocksdb/util/concurrent_arena.h
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+#include <utility>
+#include "port/likely.h"
+#include "util/allocator.h"
+#include "util/arena.h"
+#include "util/core_local.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// Only generate the field-unused warning for the padding arrays, or the
+// build will fail under GCC 4.8.1.
+#ifdef __clang__
+#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__))
+#else
+#define ROCKSDB_FIELD_UNUSED
+#endif  // __clang__
+
+namespace rocksdb {
+
+class Logger;
+
+// ConcurrentArena wraps an Arena. It makes it thread-safe using a fast
+// inlined spinlock, and adds small per-core allocation caches to avoid
+// contention for small allocations. To avoid any memory waste from the
+// per-core shards, they are kept small, they are lazily instantiated
+// only if ConcurrentArena actually notices concurrent use, and they
+// adjust their size so that there is no fragmentation waste when the
+// shard blocks are allocated from the underlying main arena.
+class ConcurrentArena : public Allocator {
+ public:
+  // block_size and huge_page_size are the same as for Arena (and are
+  // in fact just passed to the constructor of arena_).
The core-local + // shards compute their shard_block_size as a fraction of block_size + // that varies according to the hardware concurrency level. + explicit ConcurrentArena(size_t block_size = Arena::kMinBlockSize, + AllocTracker* tracker = nullptr, + size_t huge_page_size = 0); + + char* Allocate(size_t bytes) override { + return AllocateImpl(bytes, false /*force_arena*/, + [=]() { return arena_.Allocate(bytes); }); + } + + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override { + size_t rounded_up = ((bytes - 1) | (sizeof(void*) - 1)) + 1; + assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) && + (rounded_up % sizeof(void*)) == 0); + + return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/, [=]() { + return arena_.AllocateAligned(rounded_up, huge_page_size, logger); + }); + } + + size_t ApproximateMemoryUsage() const { + std::unique_lock<SpinMutex> lock(arena_mutex_, std::defer_lock); + lock.lock(); + return arena_.ApproximateMemoryUsage() - ShardAllocatedAndUnused(); + } + + size_t MemoryAllocatedBytes() const { + return memory_allocated_bytes_.load(std::memory_order_relaxed); + } + + size_t AllocatedAndUnused() const { + return arena_allocated_and_unused_.load(std::memory_order_relaxed) + + ShardAllocatedAndUnused(); + } + + size_t IrregularBlockNum() const { + return irregular_block_num_.load(std::memory_order_relaxed); + } + + size_t BlockSize() const override { return arena_.BlockSize(); } + + private: + struct Shard { + char padding[40] ROCKSDB_FIELD_UNUSED; + mutable SpinMutex mutex; + char* free_begin_; + std::atomic<size_t> allocated_and_unused_; + + Shard() : free_begin_(nullptr), allocated_and_unused_(0) {} + }; + +#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL + static __thread size_t tls_cpuid; +#else + enum ZeroFirstEnum : size_t { tls_cpuid = 0 }; +#endif + + char padding0[56] ROCKSDB_FIELD_UNUSED; + + size_t shard_block_size_; + + CoreLocalArray<Shard> shards_; + + Arena arena_; + mutable SpinMutex arena_mutex_; + std::atomic<size_t> arena_allocated_and_unused_; + std::atomic<size_t> memory_allocated_bytes_; + std::atomic<size_t> irregular_block_num_; + + char padding1[56] ROCKSDB_FIELD_UNUSED; + + Shard* Repick(); + + size_t ShardAllocatedAndUnused() const { + size_t total = 0; + for (size_t i = 0; i < shards_.Size(); ++i) { + total += shards_.AccessAtCore(i)->allocated_and_unused_.load( + std::memory_order_relaxed); + } + return total; + } + + template <typename Func> + char* AllocateImpl(size_t bytes, bool force_arena, const Func& func) { + size_t cpu; + + // Go directly to the arena if the allocation is too large, or if + // we've never needed to Repick() and the arena mutex is available + // with no waiting. This keeps the fragmentation penalty of + // concurrency zero unless it might actually confer an advantage. 
+    std::unique_lock<SpinMutex> arena_lock(arena_mutex_, std::defer_lock);
+    if (bytes > shard_block_size_ / 4 || force_arena ||
+        ((cpu = tls_cpuid) == 0 &&
+         !shards_.AccessAtCore(0)->allocated_and_unused_.load(
+             std::memory_order_relaxed) &&
+         arena_lock.try_lock())) {
+      if (!arena_lock.owns_lock()) {
+        arena_lock.lock();
+      }
+      auto rv = func();
+      Fixup();
+      return rv;
+    }
+
+    // pick a shard from which to allocate
+    Shard* s = shards_.AccessAtCore(cpu & (shards_.Size() - 1));
+    if (!s->mutex.try_lock()) {
+      s = Repick();
+      s->mutex.lock();
+    }
+    std::unique_lock<SpinMutex> lock(s->mutex, std::adopt_lock);
+
+    size_t avail = s->allocated_and_unused_.load(std::memory_order_relaxed);
+    if (avail < bytes) {
+      // refill this shard from the main arena
+      std::lock_guard<SpinMutex> reload_lock(arena_mutex_);
+
+      // If the arena's current block is within a factor of 2 of the right
+      // size, we adjust our request to avoid arena waste.
+      auto exact = arena_allocated_and_unused_.load(std::memory_order_relaxed);
+      assert(exact == arena_.AllocatedAndUnused());
+
+      if (exact >= bytes && arena_.IsInInlineBlock()) {
+        // If we haven't exhausted the arena's inline block yet, allocate from
+        // the arena directly. This ensures that we'll do the first few small
+        // allocations without allocating any blocks.
+        // In particular this prevents empty memtables from using a
+        // disproportionately large amount of memory: a memtable allocates on
+        // the order of 1 KB of memory when created; we wouldn't want to
+        // allocate a full arena block (typically a few megabytes) for that,
+        // especially if there are thousands of empty memtables.
+        auto rv = func();
+        Fixup();
+        return rv;
+      }
+
+      avail = exact >= shard_block_size_ / 2 && exact < shard_block_size_ * 2
+                  ? exact
+                  : shard_block_size_;
+      s->free_begin_ = arena_.AllocateAligned(avail);
+      Fixup();
+    }
+    s->allocated_and_unused_.store(avail - bytes, std::memory_order_relaxed);
+
+    char* rv;
+    if ((bytes % sizeof(void*)) == 0) {
+      // aligned allocation from the beginning
+      rv = s->free_begin_;
+      s->free_begin_ += bytes;
+    } else {
+      // unaligned from the end
+      rv = s->free_begin_ + avail - bytes;
+    }
+    return rv;
+  }
+
+  void Fixup() {
+    arena_allocated_and_unused_.store(arena_.AllocatedAndUnused(),
+                                      std::memory_order_relaxed);
+    memory_allocated_bytes_.store(arena_.MemoryAllocatedBytes(),
+                                  std::memory_order_relaxed);
+    irregular_block_num_.store(arena_.IrregularBlockNum(),
+                               std::memory_order_relaxed);
+  }
+
+  ConcurrentArena(const ConcurrentArena&) = delete;
+  ConcurrentArena& operator=(const ConcurrentArena&) = delete;
+};
+
+}  // namespace rocksdb
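Before the task-limiter files, a standalone sketch of the allocation shape ConcurrentArena implements above: try-lock the shard for the caller's slot, repick a neighbor on contention instead of waiting, and refill from a shared pool only when the shard runs dry. All names are illustrative and the block sizing is simplified.

#include <algorithm>
#include <memory>
#include <mutex>
#include <vector>

struct MiniShard {
  std::mutex mu;
  char* cursor = nullptr;
  size_t free_bytes = 0;
};

struct MiniPool {
  std::mutex mu;
  std::vector<std::unique_ptr<char[]>> blocks;
  // Grants a fresh block of at least `bytes`; caller must hold `mu`.
  char* Refill(MiniShard& s, size_t bytes) {
    size_t granted = std::max<size_t>(bytes, 4096);
    blocks.emplace_back(new char[granted]);
    s.free_bytes = granted;
    return blocks.back().get();
  }
};

char* ShardedAllocate(std::vector<MiniShard>& shards, MiniPool& pool,
                      size_t slot, size_t bytes) {
  MiniShard* s = &shards[slot % shards.size()];
  if (!s->mu.try_lock()) {
    s = &shards[(slot + 1) % shards.size()];  // "repick" a neighbor
    s->mu.lock();
  }
  std::lock_guard<std::mutex> g(s->mu, std::adopt_lock);
  if (s->free_bytes < bytes) {
    std::lock_guard<std::mutex> pg(pool.mu);  // slow path: shared pool lock
    s->cursor = pool.Refill(*s, bytes);
  }
  char* rv = s->cursor;
  s->cursor += bytes;
  s->free_bytes -= bytes;
  return rv;
}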
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.cc b/src/rocksdb/util/concurrent_task_limiter_impl.cc
new file mode 100644
index 00000000..e1ce4bef
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/concurrent_task_limiter_impl.h"
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace rocksdb {
+
+ConcurrentTaskLimiterImpl::ConcurrentTaskLimiterImpl(
+    const std::string& name, int32_t max_outstanding_task)
+    : name_(name),
+      max_outstanding_tasks_{max_outstanding_task},
+      outstanding_tasks_{0} {}
+
+ConcurrentTaskLimiterImpl::~ConcurrentTaskLimiterImpl() {
+  assert(outstanding_tasks_ == 0);
+}
+
+const std::string& ConcurrentTaskLimiterImpl::GetName() const {
+  return name_;
+}
+
+void ConcurrentTaskLimiterImpl::SetMaxOutstandingTask(int32_t limit) {
+  max_outstanding_tasks_.store(limit, std::memory_order_relaxed);
+}
+
+void ConcurrentTaskLimiterImpl::ResetMaxOutstandingTask() {
+  max_outstanding_tasks_.store(-1, std::memory_order_relaxed);
+}
+
+int32_t ConcurrentTaskLimiterImpl::GetOutstandingTask() const {
+  return outstanding_tasks_.load(std::memory_order_relaxed);
+}
+
+std::unique_ptr<TaskLimiterToken> ConcurrentTaskLimiterImpl::GetToken(
+    bool force) {
+  int32_t limit = max_outstanding_tasks_.load(std::memory_order_relaxed);
+  int32_t tasks = outstanding_tasks_.load(std::memory_order_relaxed);
+  // force == true bypasses the throttle.
+  // limit < 0 means unlimited tasks.
+  while (force || limit < 0 || tasks < limit) {
+    if (outstanding_tasks_.compare_exchange_weak(tasks, tasks + 1)) {
+      return std::unique_ptr<TaskLimiterToken>(new TaskLimiterToken(this));
+    }
+  }
+  return nullptr;
+}
+
+ConcurrentTaskLimiter* NewConcurrentTaskLimiter(
+    const std::string& name, int32_t limit) {
+  return new ConcurrentTaskLimiterImpl(name, limit);
+}
+
+TaskLimiterToken::~TaskLimiterToken() {
+  --limiter_->outstanding_tasks_;
+  assert(limiter_->outstanding_tasks_ >= 0);
+}
+
+}  // namespace rocksdb
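A sketch of the expected call-site shape for the limiter (the real users are RocksDB's compaction paths; the function name here is illustrative). GetToken() is declared on the impl class in the header below, so a caller holding the public ConcurrentTaskLimiter* downcasts first:

#include <memory>
#include "util/concurrent_task_limiter_impl.h"

void MaybeScheduleTask(rocksdb::ConcurrentTaskLimiter* limiter) {
  auto* impl = static_cast<rocksdb::ConcurrentTaskLimiterImpl*>(limiter);
  std::unique_ptr<rocksdb::TaskLimiterToken> token =
      impl->GetToken(/*force=*/false);
  if (token == nullptr) {
    return;  // throttled: outstanding tasks are at the configured limit
  }
  // ... run the task; the token's destructor releases the slot ...
}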
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.h b/src/rocksdb/util/concurrent_task_limiter_impl.h
new file mode 100644
index 00000000..515f1481
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/env.h"
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace rocksdb {
+
+class TaskLimiterToken;
+
+class ConcurrentTaskLimiterImpl : public ConcurrentTaskLimiter {
+ public:
+  explicit ConcurrentTaskLimiterImpl(const std::string& name,
+                                     int32_t max_outstanding_task);
+
+  virtual ~ConcurrentTaskLimiterImpl();
+
+  virtual const std::string& GetName() const override;
+
+  virtual void SetMaxOutstandingTask(int32_t limit) override;
+
+  virtual void ResetMaxOutstandingTask() override;
+
+  virtual int32_t GetOutstandingTask() const override;
+
+  // Request a token for adding a new task.
+  // If force == true, it requests a token bypassing the throttle.
+  // Returns nullptr if it got throttled.
+  virtual std::unique_ptr<TaskLimiterToken> GetToken(bool force);
+
+ private:
+  friend class TaskLimiterToken;
+
+  std::string name_;
+  std::atomic<int32_t> max_outstanding_tasks_;
+  std::atomic<int32_t> outstanding_tasks_;
+
+  // No copying allowed
+  ConcurrentTaskLimiterImpl(const ConcurrentTaskLimiterImpl&) = delete;
+  ConcurrentTaskLimiterImpl& operator=(
+      const ConcurrentTaskLimiterImpl&) = delete;
+};
+
+class TaskLimiterToken {
+ public:
+  explicit TaskLimiterToken(ConcurrentTaskLimiterImpl* limiter)
+      : limiter_(limiter) {}
+  ~TaskLimiterToken();
+
+ private:
+  ConcurrentTaskLimiterImpl* limiter_;
+
+  // no copying allowed
+  TaskLimiterToken(const TaskLimiterToken&) = delete;
+  void operator=(const TaskLimiterToken&) = delete;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h
new file mode 100644
index 00000000..4cc4fd90
--- /dev/null
+++ b/src/rocksdb/util/core_local.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+// An array of core-local values. Ideally the value type, T, is cache-aligned
+// to prevent false sharing.
+template <typename T>
+class CoreLocalArray {
+ public:
+  CoreLocalArray();
+
+  size_t Size() const;
+  // returns pointer to the element corresponding to the core that the thread
+  // currently runs on.
+  T* Access() const;
+  // same as above, but also returns the core index, which the client can cache
+  // to reduce how often the core ID needs to be retrieved. Only do this if
+  // some inaccuracy is tolerable, as the thread may migrate to a different
+  // core.
+  std::pair<T*, size_t> AccessElementAndIndex() const;
+  // returns pointer to the element for the specified core index. This can be
+  // used, e.g., for aggregation, or if the client caches the core index.
+ T* AccessAtCore(size_t core_idx) const; + + private: + std::unique_ptr<T[]> data_; + int size_shift_; +}; + +template <typename T> +CoreLocalArray<T>::CoreLocalArray() { + int num_cpus = static_cast<int>(std::thread::hardware_concurrency()); + // find a power of two >= num_cpus and >= 8 + size_shift_ = 3; + while (1 << size_shift_ < num_cpus) { + ++size_shift_; + } + data_.reset(new T[static_cast<size_t>(1) << size_shift_]); +} + +template <typename T> +size_t CoreLocalArray<T>::Size() const { + return static_cast<size_t>(1) << size_shift_; +} + +template <typename T> +T* CoreLocalArray<T>::Access() const { + return AccessElementAndIndex().first; +} + +template <typename T> +std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const { + int cpuid = port::PhysicalCoreID(); + size_t core_idx; + if (UNLIKELY(cpuid < 0)) { + // cpu id unavailable, just pick randomly + core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_); + } else { + core_idx = static_cast<size_t>(cpuid & ((1 << size_shift_) - 1)); + } + return {AccessAtCore(core_idx), core_idx}; +} + +template <typename T> +T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const { + assert(core_idx < static_cast<size_t>(1) << size_shift_); + return &data_[core_idx]; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/crc32c.cc b/src/rocksdb/util/crc32c.cc new file mode 100644 index 00000000..9e4b65e6 --- /dev/null +++ b/src/rocksdb/util/crc32c.cc @@ -0,0 +1,1231 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. 
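Before the CRC code, a quick usage sketch for the CoreLocalArray template defined above: per-core counters with a cheap hot path and an explicit aggregation walk. Illustrative only; it compiles only inside the RocksDB tree, since CoreLocalArray relies on port::PhysicalCoreID().

#include <atomic>
#include <cstdint>
#include "util/core_local.h"

// One slot per core; the struct wrapper guarantees zero-initialization.
struct Counter {
  std::atomic<uint64_t> v{0};
};

rocksdb::CoreLocalArray<Counter> hit_counters;

void RecordHit() {
  // Cheap, mostly contention-free increment on this core's slot.
  hit_counters.Access()->v.fetch_add(1, std::memory_order_relaxed);
}

uint64_t TotalHits() {
  // Aggregation walks every slot with AccessAtCore().
  uint64_t sum = 0;
  for (size_t i = 0; i < hit_counters.Size(); ++i) {
    sum += hit_counters.AccessAtCore(i)->v.load(std::memory_order_relaxed);
  }
  return sum;
}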
+#include "util/crc32c.h" +#include <stdint.h> +#ifdef HAVE_SSE42 +#include <nmmintrin.h> +#include <wmmintrin.h> +#endif +#include "util/coding.h" +#include "util/util.h" + +#ifdef __powerpc64__ +#include "util/crc32c_ppc.h" +#include "util/crc32c_ppc_constants.h" + +#if __linux__ +#include <sys/auxv.h> + +#ifndef PPC_FEATURE2_VEC_CRYPTO +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#endif /* __linux__ */ + +#endif + +namespace rocksdb { +namespace crc32c { + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +#ifdef __powerpc64__ +static int arch_ppc_crc32 = 0; +#endif /* __powerpc64__ */ +#endif + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 
0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 
+ 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 
0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast<const char*>(p)); +} + +#if defined(HAVE_SSE42) && (defined(__LP64__) || defined(_WIN64)) +static inline uint64_t LE_LOAD64(const uint8_t *p) { + return DecodeFixed64(reinterpret_cast<const char*>(p)); +} +#endif + +static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { + uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p)); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 
8) & 0xff] ^
+      table1_[(c >> 16) & 0xff] ^
+      table0_[c >> 24];
+  // Do it twice.
+  c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^
+      table2_[(c >> 8) & 0xff] ^
+      table1_[(c >> 16) & 0xff] ^
+      table0_[c >> 24];
+}
+
+static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
+#ifndef HAVE_SSE42
+  Slow_CRC32(l, p);
+#elif defined(__LP64__) || defined(_WIN64)
+  *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
+  *p += 8;
+#else
+  *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+  *p += 4;
+  *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+  *p += 4;
+#endif
+}
+
+template<void (*CRC32)(uint64_t*, uint8_t const**)>
+uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
+
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *e = p + size;
+  uint64_t l = crc ^ 0xffffffffu;
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+#define STEP1 do {                              \
+    int c = (l & 0xff) ^ *p++;                  \
+    l = table0_[c] ^ (l >> 8);                  \
+} while (0)
+
+
+  // Point x at the first 16-byte aligned byte in the string. This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+  if (x <= e) {
+    // Process bytes until finished or p is 16-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e-p) >= 16) {
+    CRC32(&l, &p);
+    CRC32(&l, &p);
+  }
+  // Process bytes 8 at a time
+  while ((e-p) >= 8) {
+    CRC32(&l, &p);
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP1
+#undef ALIGN
+  return static_cast<uint32_t>(l ^ 0xffffffffu);
+}
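For orientation, this is the byte-at-a-time reference loop that STEP1 unrolls; the extra tables table1_ through table3_ exist only so Slow_CRC32 can retire four input bytes per round of lookups (slicing-by-four). A sketch, not part of the original file, reusing table0_ from above:

static inline uint32_t NaiveExtend(uint32_t crc, const char* buf, size_t size) {
  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
  uint64_t l = crc ^ 0xffffffffu;
  for (size_t i = 0; i < size; ++i) {
    // Exactly STEP1: fold one input byte into the low bits, shift, look up.
    int c = static_cast<int>((l & 0xff) ^ p[i]);
    l = table0_[c] ^ (l >> 8);
  }
  return static_cast<uint32_t>(l ^ 0xffffffffu);
}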
+// Detect whether SSE4.2 is available at runtime.
+#ifndef HAVE_POWER8
+
+static bool isSSE42() {
+#ifndef HAVE_SSE42
+  return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+  return c_ & (1U << 20);  // SSE4.2 is bit 20 of ECX (copied from CpuId.h in Folly)
+#elif defined(_WIN64)
+  int info[4];
+  __cpuidex(info, 0x00000001, 0);
+  return (info[2] & ((int)1 << 20)) != 0;
+#else
+  return false;
+#endif
+}
+
+static bool isPCLMULQDQ() {
+#ifndef HAVE_SSE42
+  // in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ
+  // are supported by the compiler
+  return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+  return c_ & (1U << 1);  // PCLMULQDQ is in bit 1 (not bit 0)
+#elif defined(_WIN64)
+  int info[4];
+  __cpuidex(info, 0x00000001, 0);
+  return (info[2] & ((int)1 << 1)) != 0;
+#else
+  return false;
+#endif
+}
+
+#endif  // HAVE_POWER8
+
+typedef uint32_t (*Function)(uint32_t, const char*, size_t);
+
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) {
+  return crc32c_ppc(crc, (const unsigned char *)buf, size);
+}
+
+#if __linux__
+static int arch_ppc_probe(void) {
+  arch_ppc_crc32 = 0;
+
+#if defined(__powerpc64__)
+  if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1;
+#endif /* __powerpc64__ */
+
+  return arch_ppc_crc32;
+}
+#endif  // __linux__
+
+static bool isAltiVec() {
+  return arch_ppc_probe() != 0;
+}
+#endif
+
+
+std::string IsFastCrc32Supported() {
+  bool has_fast_crc = false;
+  std::string fast_zero_msg;
+  std::string arch;
+#ifdef HAVE_POWER8
+#ifdef HAS_ALTIVEC
+  if (arch_ppc_probe()) {
+    has_fast_crc = true;
+    arch = "PPC";
+  }
+#else
+  has_fast_crc = false;
+  arch = "PPC";
+#endif
+#else
+  has_fast_crc = isSSE42();
+  arch = "x86";
+#endif
+  if (has_fast_crc) {
+    fast_zero_msg.append("Supported on " + arch);
+  } else {
+    fast_zero_msg.append("Not supported on " + arch);
+  }
+  return fast_zero_msg;
+}
+
+
+/*
+ * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the author be held liable for any damages
+ *  arising from the use of this software.
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *  1. The origin of this software must not be misrepresented; you must not
+ *   claim that you wrote the original software. If you use this software
+ *   in a product, an acknowledgment in the product documentation would be
+ *   appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *   misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ * Ferry Toth + * ftoth@exalondelft.nl + * + * https://github.com/htot/crc32c + * + * Modified by Facebook + * + * Original intel whitepaper: + * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * + * This version is from the folly library, created by Dave Watson <davejwatson@fb.com> + * +*/ +#if defined HAVE_SSE42 && defined HAVE_PCLMUL + +#define CRCtriplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ + crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); + +#define CRCduplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); + +#define CRCsinglet(crc, buf, offset) \ + crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); + + +// Numbers taken directly from intel whitepaper. +// clang-format off +const uint64_t clmul_constants[] = { + 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, + 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, + 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, + 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, + 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, + 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, + 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, + 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, + 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, + 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, + 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, + 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, + 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, + 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, + 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, + 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, + 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, + 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, + 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, + 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, + 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, + 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, + 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, + 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, + 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, + 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, + 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, + 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, + 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, + 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, + 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, + 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, + 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, + 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, + 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, + 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844, + 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, + 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, + 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, + 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, + 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, + 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, + 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, + 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, + 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, + 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768, + 
0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4, + 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, + 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, + 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, + 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, + 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, + 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, + 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, + 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, + 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, + 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, + 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, + 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, + 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, + 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, + 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, + 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, + 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, +}; + +// Compute the crc32c value for buffer smaller than 8 +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void align_to_8( + size_t len, + uint64_t& crc0, // crc so far, updated on return + const unsigned char*& next) { // next data pointer, updated on return + uint32_t crc32bit = static_cast<uint32_t>(crc0); + if (len & 0x04) { + crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); + next += sizeof(uint32_t); + } + if (len & 0x02) { + crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next); + next += sizeof(uint16_t); + } + if (len & 0x01) { + crc32bit = _mm_crc32_u8(crc32bit, *(next)); + next++; + } + crc0 = crc32bit; +} + +// +// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well +// chosen constant and xor's these with the remaining CRC. +// +inline uint64_t CombineCRC( + size_t block_size, + uint64_t crc0, + uint64_t crc1, + uint64_t crc2, + const uint64_t* next2) { + const auto multiplier = + *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1); + const auto crc0_xmm = _mm_set_epi64x(0, crc0); + const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); + const auto crc1_xmm = _mm_set_epi64x(0, crc1); + const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); + const auto res = _mm_xor_si128(res0, res1); + crc0 = _mm_cvtsi128_si64(res); + crc0 = crc0 ^ *((uint64_t*)next2 - 1); + crc2 = _mm_crc32_u64(crc2, crc0); + return crc2; +} + +// Compute CRC-32C using the Intel hardware instruction. 
+#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { + const unsigned char* next = (const unsigned char*)buf; + uint64_t count; + uint64_t crc0, crc1, crc2; + crc0 = crc ^ 0xffffffffu; + + + if (len >= 8) { + // if len > 216 then align and use triplets + if (len > 216) { + { + // Work on the bytes (< 8) before the first 8-byte alignment addr starts + uint64_t align_bytes = (8 - (uintptr_t)next) & 7; + len -= align_bytes; + align_to_8(align_bytes, crc0, next); + } + + // Now work on the remaining blocks + count = len / 24; // number of triplets + len %= 24; // bytes remaining + uint64_t n = count >> 7; // #blocks = first block + full blocks + uint64_t block_size = count & 127; + if (block_size == 0) { + block_size = 128; + } else { + n++; + } + // points to the first byte of the next block + const uint64_t* next0 = (uint64_t*)next + block_size; + const uint64_t* next1 = next0 + block_size; + const uint64_t* next2 = next1 + block_size; + + crc1 = crc2 = 0; + // Use Duff's device, a for() loop inside a switch() + // statement. This needs to execute at least once, round len + // down to nearest triplet multiple + switch (block_size) { + case 128: + do { + // jumps here for a full block of len 128 + CRCtriplet(crc, next, -128); + FALLTHROUGH_INTENDED; + case 127: + // jumps here or below for the first block smaller + CRCtriplet(crc, next, -127); + FALLTHROUGH_INTENDED; + case 126: + CRCtriplet(crc, next, -126); // than 128 + FALLTHROUGH_INTENDED; + case 125: + CRCtriplet(crc, next, -125); + FALLTHROUGH_INTENDED; + case 124: + CRCtriplet(crc, next, -124); + FALLTHROUGH_INTENDED; + case 123: + CRCtriplet(crc, next, -123); + FALLTHROUGH_INTENDED; + case 122: + CRCtriplet(crc, next, -122); + FALLTHROUGH_INTENDED; + case 121: + CRCtriplet(crc, next, -121); + FALLTHROUGH_INTENDED; + case 120: + CRCtriplet(crc, next, -120); + FALLTHROUGH_INTENDED; + case 119: + CRCtriplet(crc, next, -119); + FALLTHROUGH_INTENDED; + case 118: + CRCtriplet(crc, next, -118); + FALLTHROUGH_INTENDED; + case 117: + CRCtriplet(crc, next, -117); + FALLTHROUGH_INTENDED; + case 116: + CRCtriplet(crc, next, -116); + FALLTHROUGH_INTENDED; + case 115: + CRCtriplet(crc, next, -115); + FALLTHROUGH_INTENDED; + case 114: + CRCtriplet(crc, next, -114); + FALLTHROUGH_INTENDED; + case 113: + CRCtriplet(crc, next, -113); + FALLTHROUGH_INTENDED; + case 112: + CRCtriplet(crc, next, -112); + FALLTHROUGH_INTENDED; + case 111: + CRCtriplet(crc, next, -111); + FALLTHROUGH_INTENDED; + case 110: + CRCtriplet(crc, next, -110); + FALLTHROUGH_INTENDED; + case 109: + CRCtriplet(crc, next, -109); + FALLTHROUGH_INTENDED; + case 108: + CRCtriplet(crc, next, -108); + FALLTHROUGH_INTENDED; + case 107: + CRCtriplet(crc, next, -107); + FALLTHROUGH_INTENDED; + case 106: + CRCtriplet(crc, next, -106); + FALLTHROUGH_INTENDED; + case 105: + CRCtriplet(crc, next, -105); + FALLTHROUGH_INTENDED; + case 104: + CRCtriplet(crc, next, -104); + FALLTHROUGH_INTENDED; + case 103: + CRCtriplet(crc, next, -103); + FALLTHROUGH_INTENDED; + case 102: + CRCtriplet(crc, next, -102); + FALLTHROUGH_INTENDED; + case 101: + CRCtriplet(crc, next, -101); + FALLTHROUGH_INTENDED; + case 100: + CRCtriplet(crc, next, -100); + FALLTHROUGH_INTENDED; + case 99: + CRCtriplet(crc, next, -99); + FALLTHROUGH_INTENDED; + case 98: + CRCtriplet(crc, next, -98); + FALLTHROUGH_INTENDED; + case 97: + 
CRCtriplet(crc, next, -97); + FALLTHROUGH_INTENDED; + case 96: + CRCtriplet(crc, next, -96); + FALLTHROUGH_INTENDED; + case 95: + CRCtriplet(crc, next, -95); + FALLTHROUGH_INTENDED; + case 94: + CRCtriplet(crc, next, -94); + FALLTHROUGH_INTENDED; + case 93: + CRCtriplet(crc, next, -93); + FALLTHROUGH_INTENDED; + case 92: + CRCtriplet(crc, next, -92); + FALLTHROUGH_INTENDED; + case 91: + CRCtriplet(crc, next, -91); + FALLTHROUGH_INTENDED; + case 90: + CRCtriplet(crc, next, -90); + FALLTHROUGH_INTENDED; + case 89: + CRCtriplet(crc, next, -89); + FALLTHROUGH_INTENDED; + case 88: + CRCtriplet(crc, next, -88); + FALLTHROUGH_INTENDED; + case 87: + CRCtriplet(crc, next, -87); + FALLTHROUGH_INTENDED; + case 86: + CRCtriplet(crc, next, -86); + FALLTHROUGH_INTENDED; + case 85: + CRCtriplet(crc, next, -85); + FALLTHROUGH_INTENDED; + case 84: + CRCtriplet(crc, next, -84); + FALLTHROUGH_INTENDED; + case 83: + CRCtriplet(crc, next, -83); + FALLTHROUGH_INTENDED; + case 82: + CRCtriplet(crc, next, -82); + FALLTHROUGH_INTENDED; + case 81: + CRCtriplet(crc, next, -81); + FALLTHROUGH_INTENDED; + case 80: + CRCtriplet(crc, next, -80); + FALLTHROUGH_INTENDED; + case 79: + CRCtriplet(crc, next, -79); + FALLTHROUGH_INTENDED; + case 78: + CRCtriplet(crc, next, -78); + FALLTHROUGH_INTENDED; + case 77: + CRCtriplet(crc, next, -77); + FALLTHROUGH_INTENDED; + case 76: + CRCtriplet(crc, next, -76); + FALLTHROUGH_INTENDED; + case 75: + CRCtriplet(crc, next, -75); + FALLTHROUGH_INTENDED; + case 74: + CRCtriplet(crc, next, -74); + FALLTHROUGH_INTENDED; + case 73: + CRCtriplet(crc, next, -73); + FALLTHROUGH_INTENDED; + case 72: + CRCtriplet(crc, next, -72); + FALLTHROUGH_INTENDED; + case 71: + CRCtriplet(crc, next, -71); + FALLTHROUGH_INTENDED; + case 70: + CRCtriplet(crc, next, -70); + FALLTHROUGH_INTENDED; + case 69: + CRCtriplet(crc, next, -69); + FALLTHROUGH_INTENDED; + case 68: + CRCtriplet(crc, next, -68); + FALLTHROUGH_INTENDED; + case 67: + CRCtriplet(crc, next, -67); + FALLTHROUGH_INTENDED; + case 66: + CRCtriplet(crc, next, -66); + FALLTHROUGH_INTENDED; + case 65: + CRCtriplet(crc, next, -65); + FALLTHROUGH_INTENDED; + case 64: + CRCtriplet(crc, next, -64); + FALLTHROUGH_INTENDED; + case 63: + CRCtriplet(crc, next, -63); + FALLTHROUGH_INTENDED; + case 62: + CRCtriplet(crc, next, -62); + FALLTHROUGH_INTENDED; + case 61: + CRCtriplet(crc, next, -61); + FALLTHROUGH_INTENDED; + case 60: + CRCtriplet(crc, next, -60); + FALLTHROUGH_INTENDED; + case 59: + CRCtriplet(crc, next, -59); + FALLTHROUGH_INTENDED; + case 58: + CRCtriplet(crc, next, -58); + FALLTHROUGH_INTENDED; + case 57: + CRCtriplet(crc, next, -57); + FALLTHROUGH_INTENDED; + case 56: + CRCtriplet(crc, next, -56); + FALLTHROUGH_INTENDED; + case 55: + CRCtriplet(crc, next, -55); + FALLTHROUGH_INTENDED; + case 54: + CRCtriplet(crc, next, -54); + FALLTHROUGH_INTENDED; + case 53: + CRCtriplet(crc, next, -53); + FALLTHROUGH_INTENDED; + case 52: + CRCtriplet(crc, next, -52); + FALLTHROUGH_INTENDED; + case 51: + CRCtriplet(crc, next, -51); + FALLTHROUGH_INTENDED; + case 50: + CRCtriplet(crc, next, -50); + FALLTHROUGH_INTENDED; + case 49: + CRCtriplet(crc, next, -49); + FALLTHROUGH_INTENDED; + case 48: + CRCtriplet(crc, next, -48); + FALLTHROUGH_INTENDED; + case 47: + CRCtriplet(crc, next, -47); + FALLTHROUGH_INTENDED; + case 46: + CRCtriplet(crc, next, -46); + FALLTHROUGH_INTENDED; + case 45: + CRCtriplet(crc, next, -45); + FALLTHROUGH_INTENDED; + case 44: + CRCtriplet(crc, next, -44); + FALLTHROUGH_INTENDED; + case 43: + CRCtriplet(crc, next, -43); + 
FALLTHROUGH_INTENDED; + case 42: + CRCtriplet(crc, next, -42); + FALLTHROUGH_INTENDED; + case 41: + CRCtriplet(crc, next, -41); + FALLTHROUGH_INTENDED; + case 40: + CRCtriplet(crc, next, -40); + FALLTHROUGH_INTENDED; + case 39: + CRCtriplet(crc, next, -39); + FALLTHROUGH_INTENDED; + case 38: + CRCtriplet(crc, next, -38); + FALLTHROUGH_INTENDED; + case 37: + CRCtriplet(crc, next, -37); + FALLTHROUGH_INTENDED; + case 36: + CRCtriplet(crc, next, -36); + FALLTHROUGH_INTENDED; + case 35: + CRCtriplet(crc, next, -35); + FALLTHROUGH_INTENDED; + case 34: + CRCtriplet(crc, next, -34); + FALLTHROUGH_INTENDED; + case 33: + CRCtriplet(crc, next, -33); + FALLTHROUGH_INTENDED; + case 32: + CRCtriplet(crc, next, -32); + FALLTHROUGH_INTENDED; + case 31: + CRCtriplet(crc, next, -31); + FALLTHROUGH_INTENDED; + case 30: + CRCtriplet(crc, next, -30); + FALLTHROUGH_INTENDED; + case 29: + CRCtriplet(crc, next, -29); + FALLTHROUGH_INTENDED; + case 28: + CRCtriplet(crc, next, -28); + FALLTHROUGH_INTENDED; + case 27: + CRCtriplet(crc, next, -27); + FALLTHROUGH_INTENDED; + case 26: + CRCtriplet(crc, next, -26); + FALLTHROUGH_INTENDED; + case 25: + CRCtriplet(crc, next, -25); + FALLTHROUGH_INTENDED; + case 24: + CRCtriplet(crc, next, -24); + FALLTHROUGH_INTENDED; + case 23: + CRCtriplet(crc, next, -23); + FALLTHROUGH_INTENDED; + case 22: + CRCtriplet(crc, next, -22); + FALLTHROUGH_INTENDED; + case 21: + CRCtriplet(crc, next, -21); + FALLTHROUGH_INTENDED; + case 20: + CRCtriplet(crc, next, -20); + FALLTHROUGH_INTENDED; + case 19: + CRCtriplet(crc, next, -19); + FALLTHROUGH_INTENDED; + case 18: + CRCtriplet(crc, next, -18); + FALLTHROUGH_INTENDED; + case 17: + CRCtriplet(crc, next, -17); + FALLTHROUGH_INTENDED; + case 16: + CRCtriplet(crc, next, -16); + FALLTHROUGH_INTENDED; + case 15: + CRCtriplet(crc, next, -15); + FALLTHROUGH_INTENDED; + case 14: + CRCtriplet(crc, next, -14); + FALLTHROUGH_INTENDED; + case 13: + CRCtriplet(crc, next, -13); + FALLTHROUGH_INTENDED; + case 12: + CRCtriplet(crc, next, -12); + FALLTHROUGH_INTENDED; + case 11: + CRCtriplet(crc, next, -11); + FALLTHROUGH_INTENDED; + case 10: + CRCtriplet(crc, next, -10); + FALLTHROUGH_INTENDED; + case 9: + CRCtriplet(crc, next, -9); + FALLTHROUGH_INTENDED; + case 8: + CRCtriplet(crc, next, -8); + FALLTHROUGH_INTENDED; + case 7: + CRCtriplet(crc, next, -7); + FALLTHROUGH_INTENDED; + case 6: + CRCtriplet(crc, next, -6); + FALLTHROUGH_INTENDED; + case 5: + CRCtriplet(crc, next, -5); + FALLTHROUGH_INTENDED; + case 4: + CRCtriplet(crc, next, -4); + FALLTHROUGH_INTENDED; + case 3: + CRCtriplet(crc, next, -3); + FALLTHROUGH_INTENDED; + case 2: + CRCtriplet(crc, next, -2); + FALLTHROUGH_INTENDED; + case 1: + CRCduplet(crc, next, -1); // the final triplet is actually only 2 + //{ CombineCRC(); } + crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); + if (--n > 0) { + crc1 = crc2 = 0; + block_size = 128; + // points to the first byte of the next block + next0 = next2 + 128; + next1 = next0 + 128; // from here on all blocks are 128 long + next2 = next1 + 128; + } + FALLTHROUGH_INTENDED; + case 0:; + } while (n > 0); + } + next = (const unsigned char*)next2; + } + uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets + len = len & 7; + next += (count2 * 8); + switch (count2) { + case 27: + CRCsinglet(crc0, next, -27 * 8); + FALLTHROUGH_INTENDED; + case 26: + CRCsinglet(crc0, next, -26 * 8); + FALLTHROUGH_INTENDED; + case 25: + CRCsinglet(crc0, next, -25 * 8); + FALLTHROUGH_INTENDED; + case 24: + CRCsinglet(crc0, next, -24 * 8); + 
FALLTHROUGH_INTENDED; + case 23: + CRCsinglet(crc0, next, -23 * 8); + FALLTHROUGH_INTENDED; + case 22: + CRCsinglet(crc0, next, -22 * 8); + FALLTHROUGH_INTENDED; + case 21: + CRCsinglet(crc0, next, -21 * 8); + FALLTHROUGH_INTENDED; + case 20: + CRCsinglet(crc0, next, -20 * 8); + FALLTHROUGH_INTENDED; + case 19: + CRCsinglet(crc0, next, -19 * 8); + FALLTHROUGH_INTENDED; + case 18: + CRCsinglet(crc0, next, -18 * 8); + FALLTHROUGH_INTENDED; + case 17: + CRCsinglet(crc0, next, -17 * 8); + FALLTHROUGH_INTENDED; + case 16: + CRCsinglet(crc0, next, -16 * 8); + FALLTHROUGH_INTENDED; + case 15: + CRCsinglet(crc0, next, -15 * 8); + FALLTHROUGH_INTENDED; + case 14: + CRCsinglet(crc0, next, -14 * 8); + FALLTHROUGH_INTENDED; + case 13: + CRCsinglet(crc0, next, -13 * 8); + FALLTHROUGH_INTENDED; + case 12: + CRCsinglet(crc0, next, -12 * 8); + FALLTHROUGH_INTENDED; + case 11: + CRCsinglet(crc0, next, -11 * 8); + FALLTHROUGH_INTENDED; + case 10: + CRCsinglet(crc0, next, -10 * 8); + FALLTHROUGH_INTENDED; + case 9: + CRCsinglet(crc0, next, -9 * 8); + FALLTHROUGH_INTENDED; + case 8: + CRCsinglet(crc0, next, -8 * 8); + FALLTHROUGH_INTENDED; + case 7: + CRCsinglet(crc0, next, -7 * 8); + FALLTHROUGH_INTENDED; + case 6: + CRCsinglet(crc0, next, -6 * 8); + FALLTHROUGH_INTENDED; + case 5: + CRCsinglet(crc0, next, -5 * 8); + FALLTHROUGH_INTENDED; + case 4: + CRCsinglet(crc0, next, -4 * 8); + FALLTHROUGH_INTENDED; + case 3: + CRCsinglet(crc0, next, -3 * 8); + FALLTHROUGH_INTENDED; + case 2: + CRCsinglet(crc0, next, -2 * 8); + FALLTHROUGH_INTENDED; + case 1: + CRCsinglet(crc0, next, -1 * 8); + FALLTHROUGH_INTENDED; + case 0:; + } + } + { + align_to_8(len, crc0, next); + return (uint32_t)crc0 ^ 0xffffffffu; + } +} + +#endif //HAVE_SSE42 && HAVE_PCLMUL + +static inline Function Choose_Extend() { +#ifndef HAVE_POWER8 + if (isSSE42()) { + if (isPCLMULQDQ()) { +#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C + return crc32c_3way; +#else + return ExtendImpl<Fast_CRC32>; // Fast_CRC32 will check HAVE_SSE42 itself +#endif + } + else { // no runtime PCLMULQDQ support but has SSE42 support + return ExtendImpl<Fast_CRC32>; + } + } // end of isSSE42() + else { + return ExtendImpl<Slow_CRC32>; + } +#else //HAVE_POWER8 + return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>; +#endif +} + +static Function ChosenExtend = Choose_Extend(); +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + return ChosenExtend(crc, buf, size); +} + + +} // namespace crc32c +} // namespace rocksdb diff --git a/src/rocksdb/util/crc32c.h b/src/rocksdb/util/crc32c.h new file mode 100644 index 00000000..faee5d54 --- /dev/null +++ b/src/rocksdb/util/crc32c.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stddef.h> +#include <stdint.h> +#include <string> + +namespace rocksdb { +namespace crc32c { + +extern std::string IsFastCrc32Supported(); + +// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the +// crc32c of some string A. Extend() is often used to maintain the +// crc32c of a stream of data. 
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); + +// Return the crc32c of data[0,n-1] +inline uint32_t Value(const char* data, size_t n) { + return Extend(0, data, n); +} + +static const uint32_t kMaskDelta = 0xa282ead8ul; + +// Return a masked representation of crc. +// +// Motivation: it is problematic to compute the CRC of a string that +// contains embedded CRCs. Therefore we recommend that CRCs stored +// somewhere (e.g., in files) should be masked before being stored. +inline uint32_t Mask(uint32_t crc) { + // Rotate right by 15 bits and add a constant. + return ((crc >> 15) | (crc << 17)) + kMaskDelta; +} + +// Return the crc whose masked representation is masked_crc. +inline uint32_t Unmask(uint32_t masked_crc) { + uint32_t rot = masked_crc - kMaskDelta; + return ((rot >> 17) | (rot << 15)); +} + +} // namespace crc32c +} // namespace rocksdb diff --git a/src/rocksdb/util/crc32c_ppc.c b/src/rocksdb/util/crc32c_ppc.c new file mode 100644 index 00000000..3c517c88 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc.c @@ -0,0 +1,95 @@ +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#define CRC_TABLE +#include <inttypes.h> +#include <stdlib.h> +#include <strings.h> +#include "util/crc32c_ppc_constants.h" + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN - 1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, unsigned char const *p, + unsigned long len) { + while (len--) crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#endif + +#ifdef HAVE_POWER8 +unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p, + unsigned long len); + +static uint32_t crc32_vpmsum(uint32_t crc, unsigned char const *data, + unsigned len) { + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, data, (unsigned long)len); + goto out; + } + + if ((unsigned long)data & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)data & VMX_ALIGN_MASK); + crc = crc32_align(crc, data, prealign); + len -= prealign; + data += prealign; + } + + crc = __crc32_vpmsum(crc, data, (unsigned long)len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + data += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, data, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} + +/* This wrapper function works around the fact that crc32_vpmsum + * does not gracefully handle the case where the data pointer is NULL. There + * may be room for performance improvement here. + */ +uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len) { + unsigned char *buf2; + + if (!data) { + buf2 = (unsigned char *)malloc(len); + bzero(buf2, len); + crc = crc32_vpmsum(crc, buf2, len); + free(buf2); + } else { + crc = crc32_vpmsum(crc, data, (unsigned long)len); + } + return crc; +} + +#else /* HAVE_POWER8 */ + +/* This symbol has to exist on non-ppc architectures (and on legacy + * ppc systems using power7 or below) in order to compile properly + * there, even though it won't be called. 
+ */ +uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len) { + return 0; +} + +#endif /* HAVE_POWER8 */ diff --git a/src/rocksdb/util/crc32c_ppc.h b/src/rocksdb/util/crc32c_ppc.h new file mode 100644 index 00000000..3bcaecfe --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc.h @@ -0,0 +1,20 @@ +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, + unsigned len); + +#ifdef __cplusplus +} +#endif diff --git a/src/rocksdb/util/crc32c_ppc_asm.S b/src/rocksdb/util/crc32c_ppc_asm.S new file mode 100644 index 00000000..6de79797 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc_asm.S @@ -0,0 +1,753 @@ +// Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#include <ppc-asm.h> +#include "ppc-opcode.h" + +#undef toc + +#ifndef r1 +#define r1 1 +#endif + +#ifndef r2 +#define r2 2 +#endif + + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#define __ASSEMBLY__ +#include "crc32c_ppc_constants.h" + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(__crc32_vpmsum) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 + + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, r3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ 
+#endif + +#ifdef BYTESWAP_DATA + addis r3,r2,.byteswap_constant@toc@ha + addi r3,r3,.byteswap_constant@toc@l + + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + addis r3,r2,.constants@toc@ha + addi r3,r3,.constants@toc@l + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. + */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. 
+ */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. 
r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + addis r3,r2,.barrett_constants@toc@ha + addi r3,r3,.barrett_constants@toc@l + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit + +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. 
We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(r3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + addis r3,r2,.short_constants@toc@ha + addi r3,r3,.short_constants@toc@l + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? */ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(__crc32_vpmsum) diff --git a/src/rocksdb/util/crc32c_ppc_constants.h b/src/rocksdb/util/crc32c_ppc_constants.h new file mode 100644 index 00000000..57d66303 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc_constants.h @@ -0,0 +1,901 @@ +// 
Copyright (C) 2015, 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#pragma once + +#define CRC 0x1edc6f41 +#define REFLECT +#define CRC_XOR + +#ifndef __ASSEMBLY__ +#ifdef CRC_TABLE +static const unsigned int crc_table[] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 
0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, +}; + +#endif + +#else +#define MAX_SIZE 32768 +.constants : + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 
0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` 
<< 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, 
x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 
mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` 
<< 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 
0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod 
p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + + .short_constants : + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include + the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod + p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod + p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod + p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod + p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod + p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod + p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod + p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod + p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` + */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + .barrett_constants : + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + 
.octa 0x00000000000000000000000105ec76f1 +#endif diff --git a/src/rocksdb/util/crc32c_test.cc b/src/rocksdb/util/crc32c_test.cc new file mode 100644 index 00000000..d5983586 --- /dev/null +++ b/src/rocksdb/util/crc32c_test.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "util/crc32c.h" +#include "util/testharness.h" +#include "util/coding.h" + +namespace rocksdb { +namespace crc32c { + +class CRC { }; + + +// Tests for 3-way crc32c algorithm. We need these tests because it uses +// different lookup tables than the original Fast_CRC32 +const unsigned int BUFFER_SIZE = 512 * 1024 * sizeof(uint64_t); +char buffer[BUFFER_SIZE]; + +struct ExpectedResult { + size_t offset; + size_t length; + uint32_t crc32c; +}; + +ExpectedResult expectedResults[] = { + // Zero-byte input + { 0, 0, ~0U }, + // Small aligned inputs to test special cases in SIMD implementations + { 8, 1, 1543413366 }, + { 8, 2, 523493126 }, + { 8, 3, 1560427360 }, + { 8, 4, 3422504776 }, + { 8, 5, 447841138 }, + { 8, 6, 3910050499 }, + { 8, 7, 3346241981 }, + // Small unaligned inputs + { 9, 1, 3855826643 }, + { 10, 2, 560880875 }, + { 11, 3, 1479707779 }, + { 12, 4, 2237687071 }, + { 13, 5, 4063855784 }, + { 14, 6, 2553454047 }, + { 15, 7, 1349220140 }, + // Larger inputs to test leftover chunks at the end of aligned blocks + { 8, 8, 627613930 }, + { 8, 9, 2105929409 }, + { 8, 10, 2447068514 }, + { 8, 11, 863807079 }, + { 8, 12, 292050879 }, + { 8, 13, 1411837737 }, + { 8, 14, 2614515001 }, + { 8, 15, 3579076296 }, + { 8, 16, 2897079161 }, + { 8, 17, 675168386 }, + // // Much larger inputs + { 0, BUFFER_SIZE, 2096790750 }, + { 1, BUFFER_SIZE / 2, 3854797577 }, + +}; + +TEST(CRC, StandardResults) { + + // Original Fast_CRC32 tests. + // From rfc3720 section B.4. + char buf[32]; + + memset(buf, 0, sizeof(buf)); + ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf))); + + memset(buf, 0xff, sizeof(buf)); + ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = static_cast<char>(i); + } + ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = static_cast<char>(31 - i); + } + ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf))); + + unsigned char data[48] = { + 0x01, 0xc0, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x18, + 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + }; + ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data))); + + // 3-Way Crc32c tests ported from folly. 
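// (The two loops below lean on the stitching identity, assuming only the
// declarations in crc32c.h: Extend(Value(a, m), b, n) == Value(a||b, m+n),
// i.e. a checksum may be computed in one shot or from consecutive pieces.)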
+ // Test 1: single computation + for (auto expected : expectedResults) { + uint32_t result = Value(buffer + expected.offset, expected.length); + EXPECT_EQ(~expected.crc32c, result); + } + + // Test 2: stitching two computations + for (auto expected : expectedResults) { + size_t partialLength = expected.length / 2; + uint32_t partialChecksum = Value(buffer + expected.offset, partialLength); + uint32_t result = Extend(partialChecksum, + buffer + expected.offset + partialLength, + expected.length - partialLength); + EXPECT_EQ(~expected.crc32c, result); + } + +} + +TEST(CRC, Values) { + ASSERT_NE(Value("a", 1), Value("foo", 3)); +} + +TEST(CRC, Extend) { + ASSERT_EQ(Value("hello world", 11), + Extend(Value("hello ", 6), "world", 5)); +} + +TEST(CRC, Mask) { + uint32_t crc = Value("foo", 3); + ASSERT_NE(crc, Mask(crc)); + ASSERT_NE(crc, Mask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); +} + +} // namespace crc32c +} // namespace rocksdb + +// copied from folly +const uint64_t FNV_64_HASH_START = 14695981039346656037ULL; +inline uint64_t fnv64_buf(const void* buf, + size_t n, + uint64_t hash = FNV_64_HASH_START) { + // forcing signed char, since other platforms can use unsigned + const signed char* char_buf = reinterpret_cast<const signed char*>(buf); + + for (size_t i = 0; i < n; ++i) { + hash += (hash << 1) + (hash << 4) + (hash << 5) + (hash << 7) + + (hash << 8) + (hash << 40); + hash ^= char_buf[i]; + } + return hash; +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + // Populate a buffer with a deterministic pattern + // on which to compute checksums + + const uint8_t* src = (uint8_t*)rocksdb::crc32c::buffer; + uint64_t* dst = (uint64_t*)rocksdb::crc32c::buffer; + const uint64_t* end = (const uint64_t*)(rocksdb::crc32c::buffer + rocksdb::crc32c::BUFFER_SIZE); + *dst++ = 0; + while (dst < end) { + rocksdb::EncodeFixed64(reinterpret_cast<char*>(dst), fnv64_buf((const char*)src, sizeof(uint64_t))); + dst++; + src += sizeof(uint64_t); + } + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/delete_scheduler.cc b/src/rocksdb/util/delete_scheduler.cc new file mode 100644 index 00000000..f5ee2844 --- /dev/null +++ b/src/rocksdb/util/delete_scheduler.cc @@ -0,0 +1,353 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
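// A rough usage sketch of the scheduler implemented below (hedged: the
// surrounding Env/Logger/SstFileManagerImpl wiring is assumed, not shown
// in this diff). Files routed through DeleteFile() are renamed to
// "<name>.trash" and drained by a background thread at the configured rate:
//
//   DeleteScheduler ds(env, 1 << 20 /* ~1MB/s */, info_log, sfm,
//                      /*max_trash_db_ratio=*/0.25,
//                      /*bytes_max_delete_chunk=*/0);
//   ds.DeleteFile("/db/000123.sst", "/db", /*force_bg=*/false);
//   ds.WaitForEmptyTrash();  // block until the trash queue is empty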
+ +#ifndef ROCKSDB_LITE + +#include "util/delete_scheduler.h" + +#include <thread> +#include <vector> + +#include "port/port.h" +#include "rocksdb/env.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/sst_file_manager_impl.h" +#include "util/sync_point.h" + +namespace rocksdb { + +DeleteScheduler::DeleteScheduler(Env* env, int64_t rate_bytes_per_sec, + Logger* info_log, + SstFileManagerImpl* sst_file_manager, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) + : env_(env), + total_trash_size_(0), + rate_bytes_per_sec_(rate_bytes_per_sec), + pending_files_(0), + bytes_max_delete_chunk_(bytes_max_delete_chunk), + closing_(false), + cv_(&mu_), + info_log_(info_log), + sst_file_manager_(sst_file_manager), + max_trash_db_ratio_(max_trash_db_ratio) { + assert(sst_file_manager != nullptr); + assert(max_trash_db_ratio >= 0); + bg_thread_.reset( + new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this)); +} + +DeleteScheduler::~DeleteScheduler() { + { + InstrumentedMutexLock l(&mu_); + closing_ = true; + cv_.SignalAll(); + } + if (bg_thread_) { + bg_thread_->join(); + } +} + +Status DeleteScheduler::DeleteFile(const std::string& file_path, + const std::string& dir_to_sync, + const bool force_bg) { + Status s; + if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && + total_trash_size_.load() > + sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { + // Rate limiting is disabled or trash size makes up more than + // max_trash_db_ratio_ (default 25%) of the total DB size + TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); + s = env_->DeleteFile(file_path); + if (s.ok()) { + sst_file_manager_->OnDeleteFile(file_path); + } + return s; + } + + // Move file to trash + std::string trash_file; + s = MarkAsTrash(file_path, &trash_file); + + if (!s.ok()) { + ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash", file_path.c_str()); + s = env_->DeleteFile(file_path); + if (s.ok()) { + sst_file_manager_->OnDeleteFile(file_path); + } + return s; + } + + // Update the total trash size + uint64_t trash_file_size = 0; + env_->GetFileSize(trash_file, &trash_file_size); + total_trash_size_.fetch_add(trash_file_size); + + // Add file to delete queue + { + InstrumentedMutexLock l(&mu_); + queue_.emplace(trash_file, dir_to_sync); + pending_files_++; + if (pending_files_ == 1) { + cv_.SignalAll(); + } + } + return s; +} + +std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() { + InstrumentedMutexLock l(&mu_); + return bg_errors_; +} + +const std::string DeleteScheduler::kTrashExtension = ".trash"; +bool DeleteScheduler::IsTrashFile(const std::string& file_path) { + return (file_path.size() >= kTrashExtension.size() && + file_path.rfind(kTrashExtension) == + file_path.size() - kTrashExtension.size()); +} + +Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm, + const std::string& path) { + Status s; + // Check if there are any files marked as trash in this path + std::vector<std::string> files_in_path; + s = env->GetChildren(path, &files_in_path); + if (!s.ok()) { + return s; + } + for (const std::string& current_file : files_in_path) { + if (!DeleteScheduler::IsTrashFile(current_file)) { + // not a trash file, skip + continue; + } + + Status file_delete; + std::string trash_file = path + "/" + current_file; + if (sfm) { + // We have an SstFileManager that will schedule the file delete + sfm->OnAddFile(trash_file); + file_delete = sfm->ScheduleFileDeletion(trash_file, path); + } else { + // Delete the file immediately + 
file_delete = env->DeleteFile(trash_file);
+ }
+
+ if (s.ok() && !file_delete.ok()) {
+ s = file_delete;
+ }
+ }
+
+ return s;
+}
+
+Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
+ std::string* trash_file) {
+ // Sanity check of the path
+ size_t idx = file_path.rfind("/");
+ if (idx == std::string::npos || idx == file_path.size() - 1) {
+ return Status::InvalidArgument("file_path is corrupted");
+ }
+
+ Status s;
+ if (DeleteScheduler::IsTrashFile(file_path)) {
+ // This is already a trash file
+ *trash_file = file_path;
+ return s;
+ }
+
+ *trash_file = file_path + kTrashExtension;
+ // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
+ // file_move_mu mutex.
+ int cnt = 0;
+ InstrumentedMutexLock l(&file_move_mu_);
+ while (true) {
+ s = env_->FileExists(*trash_file);
+ if (s.IsNotFound()) {
+ // We found a path for our file in trash
+ s = env_->RenameFile(file_path, *trash_file);
+ break;
+ } else if (s.ok()) {
+ // Name conflict, generate a new numbered suffix
+ *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
+ } else {
+ // Error during FileExists call, we cannot continue
+ break;
+ }
+ cnt++;
+ }
+ if (s.ok()) {
+ sst_file_manager_->OnMoveFile(file_path, *trash_file);
+ }
+ return s;
+}
+
+void DeleteScheduler::BackgroundEmptyTrash() {
+ TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
+
+ while (true) {
+ InstrumentedMutexLock l(&mu_);
+ while (queue_.empty() && !closing_) {
+ cv_.Wait();
+ }
+
+ if (closing_) {
+ return;
+ }
+
+ // Delete all files in queue_
+ uint64_t start_time = env_->NowMicros();
+ uint64_t total_deleted_bytes = 0;
+ int64_t current_delete_rate = rate_bytes_per_sec_.load();
+ while (!queue_.empty() && !closing_) {
+ if (current_delete_rate != rate_bytes_per_sec_.load()) {
+ // User changed the delete rate
+ current_delete_rate = rate_bytes_per_sec_.load();
+ start_time = env_->NowMicros();
+ total_deleted_bytes = 0;
+ }
+
+ // Get new file to delete
+ const FileAndDir& fad = queue_.front();
+ std::string path_in_trash = fad.fname;
+
+ // We don't need to hold the lock while deleting the file
+ mu_.Unlock();
+ uint64_t deleted_bytes = 0;
+ bool is_complete = true;
+ // Delete the file from trash and accumulate the deleted bytes
+ Status s =
+ DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+ total_deleted_bytes += deleted_bytes;
+ mu_.Lock();
+ if (is_complete) {
+ queue_.pop();
+ }
+
+ if (!s.ok()) {
+ bg_errors_[path_in_trash] = s;
+ }
+
+ // Apply penalty if necessary
+ uint64_t total_penalty;
+ if (current_delete_rate > 0) {
+ // rate limiting is enabled
+ total_penalty =
+ ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
+ while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) {}
+ } else {
+ // rate limiting is disabled
+ total_penalty = 0;
+ }
+ TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
+ &total_penalty);
+
+ if (is_complete) {
+ pending_files_--;
+ }
+ if (pending_files_ == 0) {
+ // Unblock WaitForEmptyTrash since there are no more files waiting
+ // to be deleted
+ cv_.SignalAll();
+ }
+ }
+ }
+}
+
+Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes,
+ bool* is_complete) {
+ uint64_t file_size;
+ Status s = env_->GetFileSize(path_in_trash, &file_size);
+ *is_complete = true;
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
+ if (s.ok()) {
+ bool need_full_delete = true;
+ if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
+ uint64_t num_hard_links = 2;
+ // We don't have to worry about a data race between linking a new
+ // file after the file-link-count check and the ftruncate, because
+ // the file is now in trash and RocksDB is not supposed to create
+ // hard links to trash files.
+ Status my_status = env_->NumFileLinks(path_in_trash, &num_hard_links);
+ if (my_status.ok()) {
+ if (num_hard_links == 1) {
+ std::unique_ptr<WritableFile> wf;
+ my_status =
+ env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
+ if (my_status.ok()) {
+ my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
+ if (my_status.ok()) {
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+ my_status = wf->Fsync();
+ }
+ }
+ if (my_status.ok()) {
+ *deleted_bytes = bytes_max_delete_chunk_;
+ need_full_delete = false;
+ *is_complete = false;
+ } else {
+ ROCKS_LOG_WARN(info_log_,
+ "Failed to partially delete %s from trash -- %s",
+ path_in_trash.c_str(), my_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_INFO(info_log_,
+ "Cannot delete %s slowly through ftruncate from trash "
+ "as it has other links",
+ path_in_trash.c_str());
+ }
+ } else if (!num_link_error_printed_) {
+ ROCKS_LOG_INFO(
+ info_log_,
+ "Cannot delete files slowly through ftruncate from trash "
+ "as Env::NumFileLinks() returns error: %s",
+ my_status.ToString().c_str());
+ num_link_error_printed_ = true;
+ }
+ }
+
+ if (need_full_delete) {
+ s = env_->DeleteFile(path_in_trash);
+ if (!dir_to_sync.empty()) {
+ std::unique_ptr<Directory> dir_obj;
+ if (s.ok()) {
+ s = env_->NewDirectory(dir_to_sync, &dir_obj);
+ }
+ if (s.ok()) {
+ s = dir_obj->Fsync();
+ TEST_SYNC_POINT_CALLBACK(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir",
+ reinterpret_cast<void*>(const_cast<std::string*>(&dir_to_sync)));
+ }
+ }
+ *deleted_bytes = file_size;
+ sst_file_manager_->OnDeleteFile(path_in_trash);
+ }
+ }
+ if (!s.ok()) {
+ // Error while getting file size or while deleting
+ ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s",
+ path_in_trash.c_str(), s.ToString().c_str());
+ *deleted_bytes = 0;
+ } else {
+ total_trash_size_.fetch_sub(*deleted_bytes);
+ }
+
+ return s;
+}
+
+void DeleteScheduler::WaitForEmptyTrash() {
+ InstrumentedMutexLock l(&mu_);
+ while (pending_files_ > 0 && !closing_) {
+ cv_.Wait();
+ }
+}
+
+} // namespace rocksdb
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/util/delete_scheduler.h b/src/rocksdb/util/delete_scheduler.h
new file mode 100644
index 00000000..29b70517
--- /dev/null
+++ b/src/rocksdb/util/delete_scheduler.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
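+
+// Editorial sketch (not part of the upstream source): a DeleteScheduler is
+// normally owned by an SstFileManagerImpl rather than built directly.
+// Mirroring how delete_scheduler_test.cc constructs one (env, logger, rate,
+// trash ratio, delete chunk), with placeholder values and paths:
+//
+//   SstFileManagerImpl sfm(env, nullptr /* logger */,
+//                          1024 * 1024 /* rate_bytes_per_sec, 1 MB/s */,
+//                          0.25 /* max_trash_db_ratio */,
+//                          128 * 1024 /* bytes_max_delete_chunk */);
+//   DeleteScheduler* ds = sfm.delete_scheduler();
+//   ds->DeleteFile("/db/000123.sst", "/db");  // renamed to 000123.sst.trash
+//   ds->WaitForEmptyTrash();                  // block until trash is drained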
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Env;
+class Logger;
+class SstFileManagerImpl;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion.
+// Instead of deleting files immediately, files are marked as trash
+// and deleted in a background thread that applies a sleep penalty between
+// deletes if they are happening at a rate faster than rate_bytes_per_sec.
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0; in this
+// case DeleteScheduler will delete files immediately.
class DeleteScheduler {
+ public:
+ DeleteScheduler(Env* env, int64_t rate_bytes_per_sec, Logger* info_log,
+ SstFileManagerImpl* sst_file_manager,
+ double max_trash_db_ratio, uint64_t bytes_max_delete_chunk);
+
+ ~DeleteScheduler();
+
+ // Return delete rate limit in bytes per second
+ int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_.load(); }
+
+ // Set delete rate limit in bytes per second
+ void SetRateBytesPerSecond(int64_t bytes_per_sec) {
+ rate_bytes_per_sec_.store(bytes_per_sec);
+ }
+
+ // Mark the file as trash and schedule its deletion. If force_bg is
+ // set, it forces the file to always be deleted in the background thread,
+ // except when rate limiting is disabled
+ Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
+ const bool force_bg = false);
+
+ // Wait for all files being deleted in the background to finish, or for
+ // the destructor to be called.
+ void WaitForEmptyTrash();
+
+ // Return a map containing errors that happened in BackgroundEmptyTrash
+ // file_path => error status
+ std::map<std::string, Status> GetBackgroundErrors();
+
+ uint64_t GetTotalTrashSize() { return total_trash_size_.load(); }
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ double GetMaxTrashDBRatio() {
+ return max_trash_db_ratio_.load();
+ }
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ void SetMaxTrashDBRatio(double r) {
+ assert(r >= 0);
+ max_trash_db_ratio_.store(r);
+ }
+
+ static const std::string kTrashExtension;
+ static bool IsTrashFile(const std::string& file_path);
+
+ // Check if there are any .trash files in path and schedule their deletion,
+ // or delete them immediately if sst_file_manager is nullptr
+ static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+ const std::string& path);
+
+ private:
+ Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+
+ Status DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes, bool* is_complete);
+
+ void BackgroundEmptyTrash();
+
+ Env* env_;
+ // total size of trash files
+ std::atomic<uint64_t> total_trash_size_;
+ // Maximum number of bytes that should be deleted per second
+ std::atomic<int64_t> rate_bytes_per_sec_;
+ // Mutex to protect queue_, pending_files_, bg_errors_, closing_
+ InstrumentedMutex mu_;
+
+ struct FileAndDir {
+ FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+ std::string fname;
+ std::string dir; // if empty, directory sync is skipped.
+ };
+
+ // Queue of trash files that need to be deleted
+ std::queue<FileAndDir> queue_;
+ // Number of trash files that are waiting to be deleted
+ int32_t pending_files_;
+ uint64_t bytes_max_delete_chunk_;
+ // Errors that happened in BackgroundEmptyTrash (file_path => error)
+ std::map<std::string, Status> bg_errors_;
+
+ bool num_link_error_printed_ = false;
+ // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
+ bool closing_;
+ // Condition variable signaled in these conditions
+ // - pending_files_ value change from 0 => 1
+ // - pending_files_ value change from 1 => 0
+ // - closing_ value is set to true
+ InstrumentedCondVar cv_;
+ // Background thread running BackgroundEmptyTrash
+ std::unique_ptr<port::Thread> bg_thread_;
+ // Mutex to protect threads from file name conflicts
+ InstrumentedMutex file_move_mu_;
+ Logger* info_log_;
+ SstFileManagerImpl* sst_file_manager_;
+ // If the trash size constitutes more than this fraction of the total DB
+ // size, we will start deleting new files passed to DeleteScheduler
+ // immediately
+ std::atomic<double> max_trash_db_ratio_;
+ static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+};
+
+} // namespace rocksdb
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/util/delete_scheduler_test.cc b/src/rocksdb/util/delete_scheduler_test.cc
new file mode 100644
index 00000000..0d8e354b
--- /dev/null
+++ b/src/rocksdb/util/delete_scheduler_test.cc
@@ -0,0 +1,696 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <atomic>
+#include <thread>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "util/delete_scheduler.h"
+#include "util/sst_file_manager_impl.h"
+#include "util/string_util.h"
+#include "util/sync_point.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace rocksdb {
+
+class DeleteSchedulerTest : public testing::Test {
+ public:
+ DeleteSchedulerTest() : env_(Env::Default()) {
+ const int kNumDataDirs = 3;
+ dummy_files_dirs_.reserve(kNumDataDirs);
+ for (size_t i = 0; i < kNumDataDirs; ++i) {
+ dummy_files_dirs_.emplace_back(
+ test::PerThreadDBPath(env_, "delete_scheduler_dummy_data_dir") +
+ ToString(i));
+ DestroyAndCreateDir(dummy_files_dirs_.back());
+ }
+ }
+
+ ~DeleteSchedulerTest() override {
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({});
+ rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+ for (const auto& dummy_files_dir : dummy_files_dirs_) {
+ test::DestroyDir(env_, dummy_files_dir);
+ }
+ }
+
+ void DestroyAndCreateDir(const std::string& dir) {
+ ASSERT_OK(test::DestroyDir(env_, dir));
+ EXPECT_OK(env_->CreateDir(dir));
+ }
+
+ int CountNormalFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int normal_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (!DeleteScheduler::IsTrashFile(f) && f != "." && f != "..") {
+ normal_cnt++;
+ }
+ }
+ return normal_cnt;
+ }
+
+ int CountTrashFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int trash_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (DeleteScheduler::IsTrashFile(f)) {
+ trash_cnt++;
+ }
+ }
+ return trash_cnt;
+ }
+
+ std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
+ size_t dummy_files_dirs_idx = 0) {
+ std::string file_path =
+ dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
+ std::unique_ptr<WritableFile> f;
+ env_->NewWritableFile(file_path, &f, EnvOptions());
+ std::string data(size, 'A');
+ EXPECT_OK(f->Append(data));
+ EXPECT_OK(f->Close());
+ sst_file_mgr_->OnAddFile(file_path, false);
+ return file_path;
+ }
+
+ void NewDeleteScheduler() {
+ // Tests in this file are for the DeleteScheduler component and don't
+ // create any DBs, so we need to set max_trash_db_ratio above 100%
+ // (1.1 instead of the default 0.25)
+ sst_file_mgr_.reset(
+ new SstFileManagerImpl(env_, nullptr, rate_bytes_per_sec_,
+ /* max_trash_db_ratio= */ 1.1, 128 * 1024));
+ delete_scheduler_ = sst_file_mgr_->delete_scheduler();
+ }
+
+ Env* env_;
+ std::vector<std::string> dummy_files_dirs_;
+ int64_t rate_bytes_per_sec_;
+ DeleteScheduler* delete_scheduler_;
+ std::unique_ptr<SstFileManagerImpl> sst_file_mgr_;
+};
+
+// Test the basic functionality of DeleteScheduler (Rate Limiting).
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in trash
+// 4- Verify that BackgroundEmptyTrash used the correct penalties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BasicRateLimiting:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ int dir_synced = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir", [&](void* arg) {
+ dir_synced++;
+ std::string* dir = reinterpret_cast<std::string*>(arg);
+ EXPECT_EQ(dummy_files_dirs_[0], *dir);
+ });
+
+ int num_files = 100; // 100 files
+ uint64_t file_size = 1024; // every file is 1 kb
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ rocksdb::SyncPoint::GetInstance()->ClearTrace();
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ dir_synced = 0;
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i],
+ dummy_files_dirs_[0]));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ uint64_t delete_start_time = env_->NowMicros();
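+ // Reaching this sync point releases the dependency loaded above, letting
+ // BackgroundEmptyTrash start draining the queue while we time it.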
+ TEST_SYNC_POINT("DeleteSchedulerTest::BasicRateLimiting:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+
+ ASSERT_EQ(num_files, dir_synced);
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+ rate_bytes_per_sec_ = 1 << 20; // 1MB
+ NewDeleteScheduler();
+
+ // Generate dummy files in multiple directories
+ const size_t kNumFiles = dummy_files_dirs_.size();
+ const size_t kFileSize = 1 << 10; // 1KB
+ std::vector<std::string> generated_files;
+ for (size_t i = 0; i < kNumFiles; i++) {
+ generated_files.push_back(NewDummyFile("file", kFileSize, i));
+ ASSERT_EQ(1, CountNormalFiles(i));
+ }
+
+ // Mark dummy files as trash
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(1, CountTrashFiles(i));
+ }
+ TEST_SYNC_POINT("DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1");
+ delete_scheduler_->WaitForEmptyTrash();
+
+ // Verify dummy files eventually got deleted
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(0, CountTrashFiles(i));
+ }
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Same as the BasicRateLimiting test but delete files in multiple threads.
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler using 10 threads
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Verify that BackgroundEmptyTrash used the correct penalties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ int thread_cnt = 10;
+ int num_files = 10; // 10 files per thread
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ rocksdb::SyncPoint::GetInstance()->ClearTrace();
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files using 10 threads and measure time spent to empty trash
+ std::atomic<int> thread_num(0);
+ std::vector<port::Thread> threads;
+ std::function<void()> delete_thread = [&]() {
+ int idx = thread_num.fetch_add(1);
+ int range_start = idx * num_files;
+ int range_end = range_start + num_files;
+ for (int j = range_start; j < range_end; j++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[j], ""));
+ }
+ };
+
+ for (int i = 0; i < thread_cnt; i++) {
+ threads.emplace_back(delete_thread);
+ }
+
+ for (size_t i = 0; i < threads.size(); i++) {
+ threads[i].join();
+ }
+
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::RateLimitingMultiThreaded:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), num_files * thread_cnt);
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ total_files_size += file_size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+// Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure
+// that when DeleteScheduler deletes a file it deletes it immediately and
+// doesn't move it to trash
+TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
+ int bg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0;
+ NewDeleteScheduler();
+
+ for (int i = 0; i < 10; i++) {
+ // Every file we delete will be deleted immediately
+ std::string dummy_file = NewDummyFile("dummy.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ ASSERT_TRUE(env_->FileExists(dummy_file).IsNotFound());
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ }
+
+ ASSERT_EQ(bg_delete_file, 0);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Testing that moving files to trash with the same name is not a problem
+// 1- Create 10 files with the same name "conflict.data"
+// 2- Delete the 10 files using DeleteScheduler
+// 3- Make sure that trash directory contains 10 files ("conflict.data" x 10)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that files are deleted from trash
+TEST_F(DeleteSchedulerTest, ConflictNames) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::ConflictNames:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Create "conflict.data" and move it to trash 10 times
+ for (int i = 0; i < 10; i++) {
+ std::string dummy_file = NewDummyFile("conflict.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ // 10 files ("conflict.data" x 10) in trash
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::ConflictNames:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete the 10 files using DeleteScheduler (move them to trash)
+// 3- Delete the 10 files directly (using env_->DeleteFile)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that DeleteScheduler failed to delete the 10 files and
+// reported 10 background errors
+TEST_F(DeleteSchedulerTest, BackgroundError) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BackgroundError:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Generate 10 dummy files and move them to trash
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+ // Delete 10 files from trash, this will cause background errors in
+ // BackgroundEmptyTrash since we already deleted the files it was
+ // going to delete
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data.trash";
+ ASSERT_OK(env_->DeleteFile(dummy_files_dirs_[0] + "/" + file_name));
+ }
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::BackgroundError:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 10);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete 10 dummy files using DeleteScheduler
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure all files in trash directory were deleted
+// 5- Repeat previous steps 5 times
+TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
+ int bg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ // Move files to trash, wait for empty trash, start again
+ for (int run = 1; run <= 5; run++) {
+ // Generate 10 dummy files and move them to trash
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, 10 * run);
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ }
+
+ ASSERT_EQ(bg_delete_file, 50);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DeletePartialFile) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ // Should be deleted in 4 batches
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_1", 500 * 1024), ""));
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 100 * 1024), ""));
+ // Should be deleted in 2 batches
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 200 * 1024), ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(7, bg_delete_file);
+ ASSERT_EQ(4, bg_fsync);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ std::string file1 = NewDummyFile("data_1", 500 * 1024);
+ std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+ ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+ ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+ // Would be deleted in 4 batches if there were no hard links
+ ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+ ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(2, bg_delete_file);
+ ASSERT_EQ(0, bg_fsync);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
+// 2- Delete 100 files using DeleteScheduler
+// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
+// 4- Make sure that not all files were deleted from trash and that
+// DeleteScheduler background thread did not delete all files
+TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
+ int bg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1; // 1 Byte / sec
+ NewDeleteScheduler();
+
+ for (int i = 0; i < 100; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+
+ // Deleting 100 files at 1 byte/sec would need >28 hours, so we destroy
+ // the DeleteScheduler while the delete queue is not empty
+ sst_file_mgr_.reset();
+
+ ASSERT_LT(bg_delete_file, 100);
+ ASSERT_GT(CountTrashFiles(), 0);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) {
+ std::vector<uint64_t> penalties;
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile",
+ [&](void* /*arg*/) { fg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::DynamicRateLimiting1:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0; // Disable rate limiting initially
+ NewDeleteScheduler();
+
+
+ int num_files = 10; // 10 files
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<int64_t> delete_kbs_per_sec = {512, 200, 0, 100, 50, -2, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ bg_delete_file = 0;
+ fg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->ClearTrace();
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ delete_scheduler_->SetRateBytesPerSecond(rate_bytes_per_sec_);
+
+ // Create 10 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ if (rate_bytes_per_sec_ > 0) {
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::DynamicRateLimiting1:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
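+ // The measured wall-clock time should be at least ~90% of the final
+ // accumulated penalty.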
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+ ASSERT_EQ(bg_delete_file, num_files);
+ ASSERT_EQ(fg_delete_file, 0);
+ } else {
+ ASSERT_EQ(penalties.size(), 0);
+ ASSERT_EQ(bg_delete_file, 0);
+ ASSERT_EQ(fg_delete_file, num_files);
+ }
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) {
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_files = 100; // 100 files
+ uint64_t file_size = 1024 * 10; // 10 KB per file
+ rate_bytes_per_sec_ = 1; // 1 byte per sec (very slow trash delete)
+
+ NewDeleteScheduler();
+ delete_scheduler_->SetMaxTrashDBRatio(0.25);
+
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ for (std::string& file_name : generated_files) {
+ delete_scheduler_->DeleteFile(file_name, "");
+ }
+
+ // When we end up with 26 files in trash we will start
+ // deleting new files immediately
+ ASSERT_EQ(fg_delete_file, 74);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, IsTrashCheck) {
+ // Trash files
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("x.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile(".trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("/a/b/c/abc..sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("^^^^^.log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.t.trash"));
+
+ // Not trash files
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.txt"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sstrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("^^^^^.trashh"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile(".ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+int main(int /*argc*/, char** /*argv*/) {
+ printf("DeleteScheduler is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/util/duplicate_detector.h b/src/rocksdb/util/duplicate_detector.h
new file mode 100644
index 00000000..40a1cbd1
--- /dev/null
+++ b/src/rocksdb/util/duplicate_detector.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
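+
+// Editorial sketch (not part of the upstream source): during WAL recovery a
+// write-batch handler would query the detector once per key/sequence pair,
+// with db_impl, cf_id, key, and seq as placeholders, for example:
+//
+//   DuplicateDetector detector(db_impl);
+//   if (detector.IsDuplicateKeySeq(cf_id, key, seq)) {
+//     // the same (cf, key) already appeared under this batch's sequence
+//     // number, so this insertion would have been a duplicate
+//   }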
+
+#pragma once
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+
+#include "util/set_comparator.h"
+
+namespace rocksdb {
+// During recovery, if the memtable is flushed we cannot rely on it for
+// duplicate key detection, as the key insert will not be attempted. This
+// class is used as an emulator of the memtable to tell whether inserting
+// a key/seq would have resulted in a duplication.
+class DuplicateDetector {
+ public:
+ explicit DuplicateDetector(DBImpl* db) : db_(db) {}
+ bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) {
+ assert(seq >= batch_seq_);
+ if (batch_seq_ != seq) { // it is a new batch
+ keys_.clear();
+ }
+ batch_seq_ = seq;
+ CFKeys& cf_keys = keys_[cf];
+ if (cf_keys.size() == 0) { // just inserted
+ InitWithComp(cf);
+ }
+ auto it = cf_keys.insert(key);
+ if (it.second == false) { // second is false if an element already existed.
+ keys_.clear();
+ InitWithComp(cf);
+ keys_[cf].insert(key);
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ SequenceNumber batch_seq_ = 0;
+ DBImpl* db_;
+ using CFKeys = std::set<Slice, SetComparator>;
+ std::map<uint32_t, CFKeys> keys_;
+ void InitWithComp(const uint32_t cf) {
+ auto h = db_->GetColumnFamilyHandle(cf);
+ if (!h) {
+ // TODO(myabandeh): This is not a concern in MyRocks as drop cf is not
+ // implemented yet. When it is, we should return a proper error instead
+ // of throwing an exception.
+ ROCKS_LOG_FATAL(
+ db_->immutable_db_options().info_log,
+ "Recovering an entry from the dropped column family %" PRIu32
+ ". WAL must have been emptied before dropping the column "
+ "family", cf);
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error(
+ "Recovering an entry from a dropped column family. "
+ "WAL must have been flushed before dropping the column "
+ "family");
+#endif
+ return;
+ }
+ auto cmp = h->GetComparator();
+ keys_[cf] = CFKeys(SetComparator(cmp));
+ }
+};
+} // namespace rocksdb
diff --git a/src/rocksdb/util/dynamic_bloom.cc b/src/rocksdb/util/dynamic_bloom.cc
new file mode 100644
index 00000000..8e90efd8
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "dynamic_bloom.h"
+
+#include <algorithm>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/allocator.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+
+uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
+ uint32_t num_blocks =
+ (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+ // Make num_blocks an odd number to make sure more bits are involved
+ // when determining which block.
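+ // (An odd block count shares no factor with the power-of-two hash range,
+ // so every hash bit can influence which block is chosen.)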
+ if (num_blocks % 2 == 0) {
+ num_blocks++;
+ }
+
+ return num_blocks * (CACHE_LINE_SIZE * 8);
+}
+}
+
+DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, uint32_t num_probes,
+ size_t huge_page_tlb_size, Logger* logger)
+ : DynamicBloom(num_probes) {
+ SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger);
+}
+
+DynamicBloom::DynamicBloom(uint32_t num_probes)
+ : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {}
+
+void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
+ uint32_t num_blocks) {
+ data_ = reinterpret_cast<std::atomic<uint8_t>*>(raw_data);
+ kTotalBits = total_bits;
+ kNumBlocks = num_blocks;
+}
+
+void DynamicBloom::SetTotalBits(Allocator* allocator,
+ uint32_t total_bits, uint32_t locality,
+ size_t huge_page_tlb_size,
+ Logger* logger) {
+ kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
+ : (total_bits + 7) / 8 * 8;
+ kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+
+ assert(kNumBlocks > 0 || kTotalBits > 0);
+ assert(kNumProbes > 0);
+
+ uint32_t sz = kTotalBits / 8;
+ if (kNumBlocks > 0) {
+ sz += CACHE_LINE_SIZE - 1;
+ }
+ assert(allocator);
+
+ char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+ memset(raw, 0, sz);
+ auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
+ if (kNumBlocks > 0 && cache_line_offset > 0) {
+ raw += CACHE_LINE_SIZE - cache_line_offset;
+ }
+ data_ = reinterpret_cast<std::atomic<uint8_t>*>(raw);
+}
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/dynamic_bloom.h b/src/rocksdb/util/dynamic_bloom.h
new file mode 100644
index 00000000..654bc9ad
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.h
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/slice.h"
+
+#include "port/port.h"
+#include "util/hash.h"
+
+#include <atomic>
+#include <memory>
+
+namespace rocksdb {
+
+class Slice;
+class Allocator;
+class Logger;
+
+class DynamicBloom {
+ public:
+ // allocator: pass the allocator to the bloom filter so that its memory
+ // usage can be traced
+ // total_bits: fixed total bits for the bloom
+ // num_probes: number of hash probes for a single key
+ // locality: If positive, optimize for cache line locality, 0 otherwise.
+ // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
+ // within this page size. Need to reserve huge pages for
+ // it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ explicit DynamicBloom(Allocator* allocator,
+ uint32_t total_bits, uint32_t locality = 0,
+ uint32_t num_probes = 6,
+ size_t huge_page_tlb_size = 0,
+ Logger* logger = nullptr);
+
+ explicit DynamicBloom(uint32_t num_probes = 6);
+
+ void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, size_t huge_page_tlb_size,
+ Logger* logger);
+
+ ~DynamicBloom() {}
+
+ // Assuming single threaded access to this function.
+ void Add(const Slice& key);
+
+ // Like Add, but may be called concurrently with other functions.
+ void AddConcurrently(const Slice& key);
+
+ // Assuming single threaded access to this function.
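+ // The hash is expected to come from BloomHash(key), as in Add() below.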
+ void AddHash(uint32_t hash);
+
+ // Like AddHash, but may be called concurrently with other functions.
+ void AddHashConcurrently(uint32_t hash);
+
+ // Multithreaded access to this function is OK
+ bool MayContain(const Slice& key) const;
+
+ // Multithreaded access to this function is OK
+ bool MayContainHash(uint32_t hash) const;
+
+ void Prefetch(uint32_t h);
+
+ uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+ Slice GetRawData() const {
+ return Slice(reinterpret_cast<char*>(data_), GetTotalBits() / 8);
+ }
+
+ void SetRawData(unsigned char* raw_data, uint32_t total_bits,
+ uint32_t num_blocks = 0);
+
+ uint32_t GetTotalBits() const { return kTotalBits; }
+
+ bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
+ private:
+ uint32_t kTotalBits;
+ uint32_t kNumBlocks;
+ const uint32_t kNumProbes;
+
+ std::atomic<uint8_t>* data_;
+
+ // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
+ // concurrency safety, working with bytes.
+ template <typename OrFunc>
+ void AddHash(uint32_t hash, const OrFunc& or_func);
+};
+
+inline void DynamicBloom::Add(const Slice& key) { AddHash(BloomHash(key)); }
+
+inline void DynamicBloom::AddConcurrently(const Slice& key) {
+ AddHashConcurrently(BloomHash(key));
+}
+
+inline void DynamicBloom::AddHash(uint32_t hash) {
+ AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+ ptr->store(ptr->load(std::memory_order_relaxed) | mask,
+ std::memory_order_relaxed);
+ });
+}
+
+inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
+ AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+ // Happens-before between AddHash and MayContain is handled by
+ // access to versions_->LastSequence(), so all we have to do here is
+ // avoid races (so we don't give the compiler a license to mess up
+ // our code) and not lose bits. std::memory_order_relaxed is enough
+ // for that.
+ if ((mask & ptr->load(std::memory_order_relaxed)) != mask) {
+ ptr->fetch_or(mask, std::memory_order_relaxed);
+ }
+ });
+}
+
+inline bool DynamicBloom::MayContain(const Slice& key) const {
+ return (MayContainHash(BloomHash(key)));
+}
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+inline void DynamicBloom::Prefetch(uint32_t h) {
+ if (kNumBlocks != 0) {
+ uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
+ PREFETCH(&(data_[b / 8]), 0, 3);
+ }
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+inline bool DynamicBloom::MayContainHash(uint32_t h) const {
+ assert(IsInitialized());
+ const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
+ if (kNumBlocks != 0) {
+ uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+ // to a simple AND operation by the compiler.
+ const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
+ uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
+ if ((byteval & (1 << (bitpos % 8))) == 0) {
+ return false;
+ }
+ // Rotate h so that we don't reuse the same bytes.
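+ // (The division/multiplication below swap the low bits, which pick the
+ // bit within the cache line, with the high bits, so each probe draws on
+ // fresh hash bits while staying in the same cache line.)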
+ h = h / (CACHE_LINE_SIZE * 8) +
+ (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
+ h += delta;
+ }
+ } else {
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ const uint32_t bitpos = h % kTotalBits;
+ uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
+ if ((byteval & (1 << (bitpos % 8))) == 0) {
+ return false;
+ }
+ h += delta;
+ }
+ }
+ return true;
+}
+
+template <typename OrFunc>
+inline void DynamicBloom::AddHash(uint32_t h, const OrFunc& or_func) {
+ assert(IsInitialized());
+ const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
+ if (kNumBlocks != 0) {
+ uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+ // to a simple AND operation by the compiler.
+ const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
+ or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
+ // Rotate h so that we don't reuse the same bytes.
+ h = h / (CACHE_LINE_SIZE * 8) +
+ (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
+ h += delta;
+ }
+ } else {
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ const uint32_t bitpos = h % kTotalBits;
+ or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
+ h += delta;
+ }
+ }
+}
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/dynamic_bloom_test.cc b/src/rocksdb/util/dynamic_bloom_test.cc
new file mode 100644
index 00000000..4244bff1
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom_test.cc
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> +#include <algorithm> +#include <atomic> +#include <functional> +#include <memory> +#include <thread> +#include <vector> + +#include "dynamic_bloom.h" +#include "port/port.h" +#include "util/arena.h" +#include "util/gflags_compat.h" +#include "util/logging.h" +#include "util/stop_watch.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(bits_per_key, 10, ""); +DEFINE_int32(num_probes, 6, ""); +DEFINE_bool(enable_perf, false, ""); + +namespace rocksdb { + +static Slice Key(uint64_t i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class DynamicBloomTest : public testing::Test {}; + +TEST_F(DynamicBloomTest, EmptyFilter) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 0, 2); + ASSERT_TRUE(!bloom1.MayContain("hello")); + ASSERT_TRUE(!bloom1.MayContain("world")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + ASSERT_TRUE(!bloom2.MayContain("hello")); + ASSERT_TRUE(!bloom2.MayContain("world")); +} + +TEST_F(DynamicBloomTest, Small) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 0, 2); + bloom1.Add("hello"); + bloom1.Add("world"); + ASSERT_TRUE(bloom1.MayContain("hello")); + ASSERT_TRUE(bloom1.MayContain("world")); + ASSERT_TRUE(!bloom1.MayContain("x")); + ASSERT_TRUE(!bloom1.MayContain("foo")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + bloom2.Add("hello"); + bloom2.Add("world"); + ASSERT_TRUE(bloom2.MayContain("hello")); + ASSERT_TRUE(bloom2.MayContain("world")); + ASSERT_TRUE(!bloom2.MayContain("x")); + ASSERT_TRUE(!bloom2.MayContain("foo")); +} + +TEST_F(DynamicBloomTest, SmallConcurrentAdd) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 0, 2); + bloom1.AddConcurrently("hello"); + bloom1.AddConcurrently("world"); + ASSERT_TRUE(bloom1.MayContain("hello")); + ASSERT_TRUE(bloom1.MayContain("world")); + ASSERT_TRUE(!bloom1.MayContain("x")); + ASSERT_TRUE(!bloom1.MayContain("foo")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + bloom2.AddConcurrently("hello"); + bloom2.AddConcurrently("world"); + ASSERT_TRUE(bloom2.MayContain("hello")); + ASSERT_TRUE(bloom2.MayContain("world")); + ASSERT_TRUE(!bloom2.MayContain("x")); + ASSERT_TRUE(!bloom2.MayContain("foo")); +} + +static uint32_t NextNum(uint32_t num) { + if (num < 10) { + num += 1; + } else if (num < 100) { + num += 10; + } else if (num < 1000) { + num += 100; + } else { + num += 1000; + } + return num; +} + +TEST_F(DynamicBloomTest, VaryingLengths) { + char buffer[sizeof(uint64_t)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes); + + fprintf(stderr, "bits_per_key: %d num_probes: %d\n", FLAGS_bits_per_key, + num_probes); + + for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) { + for (uint32_t num = 1; num <= 10000; num = NextNum(num)) { + uint32_t bloom_bits = 0; + Arena arena; + if (enable_locality == 0) { + bloom_bits = std::max(num * FLAGS_bits_per_key, 64U); + } else { + bloom_bits = std::max(num * FLAGS_bits_per_key, + enable_locality * CACHE_LINE_SIZE * 8); + } + DynamicBloom bloom(&arena, bloom_bits, enable_locality, num_probes); + for (uint64_t i = 0; i < num; i++) { + bloom.Add(Key(i, buffer)); + 
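// Probing a key right after adding it must always hit; a Bloom filter
+ // can return false positives but never false negatives.
+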
ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); + } + + // All added keys must match + for (uint64_t i = 0; i < num; i++) { + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) << "Num " << num + << "; key " << i; + } + + // Check false positive rate + + int result = 0; + for (uint64_t i = 0; i < 10000; i++) { + if (bloom.MayContain(Key(i + 1000000000, buffer))) { + result++; + } + } + double rate = result / 10000.0; + + fprintf(stderr, + "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, " + "enable locality?%u\n", + rate * 100.0, num, bloom_bits, enable_locality); + + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + + fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters, + mediocre_filters); + ASSERT_LE(mediocre_filters, good_filters / 5); + } +} + +TEST_F(DynamicBloomTest, perf) { + StopWatchNano timer(Env::Default()); + uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes); + + if (!FLAGS_enable_perf) { + return; + } + + for (uint32_t m = 1; m <= 8; ++m) { + Arena arena; + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8); + + DynamicBloom std_bloom(&arena, num_keys * 10, 0, num_probes); + + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + std_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8)); + } + + uint64_t elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n", + elapsed / num_keys); + + uint32_t count = 0; + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + if (std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8))) { + ++count; + } + } + ASSERT_EQ(count, num_keys); + elapsed = timer.ElapsedNanos(); + assert(count > 0); + fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n", + elapsed / count); + + // Locality enabled version + DynamicBloom blocked_bloom(&arena, num_keys * 10, 1, num_probes); + + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8)); + } + + elapsed = timer.ElapsedNanos(); + fprintf(stderr, + "blocked bloom(enable locality), avg add latency %" PRIu64 "\n", + elapsed / num_keys); + + count = 0; + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + if (blocked_bloom.MayContain( + Slice(reinterpret_cast<const char*>(&i), 8))) { + ++count; + } + } + + elapsed = timer.ElapsedNanos(); + assert(count > 0); + fprintf(stderr, + "blocked bloom(enable locality), avg query latency %" PRIu64 "\n", + elapsed / count); + ASSERT_TRUE(count == num_keys); + } +} + +TEST_F(DynamicBloomTest, concurrent_with_perf) { + StopWatchNano timer(Env::Default()); + uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes); + + uint32_t m_limit = FLAGS_enable_perf ? 8 : 1; + uint32_t locality_limit = FLAGS_enable_perf ? 
1 : 0; + + uint32_t num_threads = 4; + std::vector<port::Thread> threads; + + for (uint32_t m = 1; m <= m_limit; ++m) { + for (uint32_t locality = 0; locality <= locality_limit; ++locality) { + Arena arena; + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys with %" PRIu32 " locality\n", + m * 8, locality); + + DynamicBloom std_bloom(&arena, num_keys * 10, locality, num_probes); + + timer.Start(); + + std::function<void(size_t)> adder([&](size_t t) { + for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) { + std_bloom.AddConcurrently( + Slice(reinterpret_cast<const char*>(&i), 8)); + } + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(adder, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + uint64_t elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg parallel add latency %" PRIu64 + " nanos/key\n", + elapsed / num_keys); + + timer.Start(); + + std::function<void(size_t)> hitter([&](size_t t) { + for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) { + bool f = + std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8)); + ASSERT_TRUE(f); + } + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(hitter, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg parallel hit latency %" PRIu64 + " nanos/key\n", + elapsed / num_keys); + + timer.Start(); + + std::atomic<uint32_t> false_positives(0); + std::function<void(size_t)> misser([&](size_t t) { + for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys; + i += num_threads) { + bool f = + std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8)); + if (f) { + ++false_positives; + } + } + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(misser, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg parallel miss latency %" PRIu64 + " nanos/key, %f%% false positive rate\n", + elapsed / num_keys, false_positives.load() * 100.0 / num_keys); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + + return RUN_ALL_TESTS(); +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/event_logger.cc b/src/rocksdb/util/event_logger.cc new file mode 100644 index 00000000..b488984f --- /dev/null +++ b/src/rocksdb/util/event_logger.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
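+
+// Editorial sketch (not part of the upstream source): EventLogger is used
+// through the streaming interface declared in event_logger.h, e.g.
+//
+//   EventLogger event_logger(info_log);  // info_log: some rocksdb::Logger*
+//   event_logger.Log() << "job" << 1 << "event" << "table_file_creation";
+//
+// which emits a single EVENT_LOG_v1 log line whose payload is one JSON
+// object (see the example at the bottom of event_logger.h).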
+ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "util/event_logger.h" + +#include <inttypes.h> +#include <cassert> +#include <sstream> +#include <string> + +#include "util/logging.h" +#include "util/string_util.h" + +namespace rocksdb { + + +EventLoggerStream::EventLoggerStream(Logger* logger) + : logger_(logger), log_buffer_(nullptr), json_writer_(nullptr) {} + +EventLoggerStream::EventLoggerStream(LogBuffer* log_buffer) + : logger_(nullptr), log_buffer_(log_buffer), json_writer_(nullptr) {} + +EventLoggerStream::~EventLoggerStream() { + if (json_writer_) { + json_writer_->EndObject(); +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", json_writer_->Get().c_str()); +#else + if (logger_) { + EventLogger::Log(logger_, *json_writer_); + } else if (log_buffer_) { + EventLogger::LogToBuffer(log_buffer_, *json_writer_); + } +#endif + delete json_writer_; + } +} + +void EventLogger::Log(const JSONWriter& jwriter) { + Log(logger_, jwriter); +} + +void EventLogger::Log(Logger* logger, const JSONWriter& jwriter) { +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", jwriter.Get().c_str()); +#else + rocksdb::Log(logger, "%s %s", Prefix(), jwriter.Get().c_str()); +#endif +} + +void EventLogger::LogToBuffer( + LogBuffer* log_buffer, const JSONWriter& jwriter) { +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", jwriter.Get().c_str()); +#else + assert(log_buffer); + rocksdb::LogToBuffer(log_buffer, "%s %s", Prefix(), jwriter.Get().c_str()); +#endif +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/event_logger.h b/src/rocksdb/util/event_logger.h new file mode 100644 index 00000000..d88a6a4f --- /dev/null +++ b/src/rocksdb/util/event_logger.h @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
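+
+// JSONWriter (below) builds a one-line JSON object incrementally: the first
+// string streamed into it becomes a key, the next token its value, and so
+// on. A minimal sketch with placeholder values:
+//
+//   JSONWriter w;
+//   w << "files";
+//   w.StartArray();
+//   w << 8 << 9;
+//   w.EndArray();
+//   w.EndObject();
+//   // w.Get() now returns {"files": [8, 9]}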
+ +#pragma once + +#include <memory> +#include <sstream> +#include <string> +#include <chrono> + +#include "rocksdb/env.h" +#include "util/log_buffer.h" + +namespace rocksdb { + +class JSONWriter { + public: + JSONWriter() : state_(kExpectKey), first_element_(true), in_array_(false) { + stream_ << "{"; + } + + void AddKey(const std::string& key) { + assert(state_ == kExpectKey); + if (!first_element_) { + stream_ << ", "; + } + stream_ << "\"" << key << "\": "; + state_ = kExpectValue; + first_element_ = false; + } + + void AddValue(const char* value) { + assert(state_ == kExpectValue || state_ == kInArray); + if (state_ == kInArray && !first_element_) { + stream_ << ", "; + } + stream_ << "\"" << value << "\""; + if (state_ != kInArray) { + state_ = kExpectKey; + } + first_element_ = false; + } + + template <typename T> + void AddValue(const T& value) { + assert(state_ == kExpectValue || state_ == kInArray); + if (state_ == kInArray && !first_element_) { + stream_ << ", "; + } + stream_ << value; + if (state_ != kInArray) { + state_ = kExpectKey; + } + first_element_ = false; + } + + void StartArray() { + assert(state_ == kExpectValue); + state_ = kInArray; + in_array_ = true; + stream_ << "["; + first_element_ = true; + } + + void EndArray() { + assert(state_ == kInArray); + state_ = kExpectKey; + in_array_ = false; + stream_ << "]"; + first_element_ = false; + } + + void StartObject() { + assert(state_ == kExpectValue); + state_ = kExpectKey; + stream_ << "{"; + first_element_ = true; + } + + void EndObject() { + assert(state_ == kExpectKey); + stream_ << "}"; + first_element_ = false; + } + + void StartArrayedObject() { + assert(state_ == kInArray && in_array_); + state_ = kExpectValue; + if (!first_element_) { + stream_ << ", "; + } + StartObject(); + } + + void EndArrayedObject() { + assert(in_array_); + EndObject(); + state_ = kInArray; + } + + std::string Get() const { return stream_.str(); } + + JSONWriter& operator<<(const char* val) { + if (state_ == kExpectKey) { + AddKey(val); + } else { + AddValue(val); + } + return *this; + } + + JSONWriter& operator<<(const std::string& val) { + return *this << val.c_str(); + } + + template <typename T> + JSONWriter& operator<<(const T& val) { + assert(state_ != kExpectKey); + AddValue(val); + return *this; + } + + private: + enum JSONWriterState { + kExpectKey, + kExpectValue, + kInArray, + kInArrayedObject, + }; + JSONWriterState state_; + bool first_element_; + bool in_array_; + std::ostringstream stream_; +}; + +class EventLoggerStream { + public: + template <typename T> + EventLoggerStream& operator<<(const T& val) { + MakeStream(); + *json_writer_ << val; + return *this; + } + + void StartArray() { json_writer_->StartArray(); } + void EndArray() { json_writer_->EndArray(); } + void StartObject() { json_writer_->StartObject(); } + void EndObject() { json_writer_->EndObject(); } + + ~EventLoggerStream(); + + private: + void MakeStream() { + if (!json_writer_) { + json_writer_ = new JSONWriter(); + *this << "time_micros" + << std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now().time_since_epoch()).count(); + } + } + friend class EventLogger; + explicit EventLoggerStream(Logger* logger); + explicit EventLoggerStream(LogBuffer* log_buffer); + // exactly one is non-nullptr + Logger* const logger_; + LogBuffer* const log_buffer_; + // ownership + JSONWriter* json_writer_; +}; + +// here is an example of the output that will show up in the LOG: +// 2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 
{"time_micros": +// 1421360005788015, "event": "table_file_creation", "file_number": 12, +// "file_size": 1909699} +class EventLogger { + public: + static const char* Prefix() { + return "EVENT_LOG_v1"; + } + + explicit EventLogger(Logger* logger) : logger_(logger) {} + EventLoggerStream Log() { return EventLoggerStream(logger_); } + EventLoggerStream LogToBuffer(LogBuffer* log_buffer) { + return EventLoggerStream(log_buffer); + } + void Log(const JSONWriter& jwriter); + static void Log(Logger* logger, const JSONWriter& jwriter); + static void LogToBuffer(LogBuffer* log_buffer, const JSONWriter& jwriter); + + private: + Logger* logger_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/event_logger_test.cc b/src/rocksdb/util/event_logger_test.cc new file mode 100644 index 00000000..4bcf30ff --- /dev/null +++ b/src/rocksdb/util/event_logger_test.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <string> + +#include "util/event_logger.h" +#include "util/testharness.h" + +namespace rocksdb { + +class EventLoggerTest : public testing::Test {}; + +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + vsnprintf(buffer_, sizeof(buffer_), format, ap); + } + char* buffer() { return buffer_; } + + private: + char buffer_[1000]; +}; + +TEST_F(EventLoggerTest, SimpleTest) { + StringLogger logger; + EventLogger event_logger(&logger); + event_logger.Log() << "id" << 5 << "event" + << "just_testing"; + std::string output(logger.buffer()); + ASSERT_TRUE(output.find("\"event\": \"just_testing\"") != std::string::npos); + ASSERT_TRUE(output.find("\"id\": 5") != std::string::npos); + ASSERT_TRUE(output.find("\"time_micros\"") != std::string::npos); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/fault_injection_test_env.cc b/src/rocksdb/util/fault_injection_test_env.cc new file mode 100644 index 00000000..9cad2387 --- /dev/null +++ b/src/rocksdb/util/fault_injection_test_env.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". + +#include "util/fault_injection_test_env.h" +#include <functional> +#include <utility> + +namespace rocksdb { + +// Assume a filename, and not a directory name like "/foo/bar/" +std::string GetDirName(const std::string filename) { + size_t found = filename.find_last_of("/\\"); + if (found == std::string::npos) { + return ""; + } else { + return filename.substr(0, found); + } +} + +// A basic file truncation function suitable for this test. 
+Status Truncate(Env* env, const std::string& filename, uint64_t length) {
+  std::unique_ptr<SequentialFile> orig_file;
+  const EnvOptions options;
+  Status s = env->NewSequentialFile(filename, &orig_file, options);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot open file %s for truncation: %s\n",
+            filename.c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  std::unique_ptr<char[]> scratch(new char[length]);
+  rocksdb::Slice result;
+  s = orig_file->Read(length, &result, scratch.get());
+#ifdef OS_WIN
+  orig_file.reset();
+#endif
+  if (s.ok()) {
+    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+    std::unique_ptr<WritableFile> tmp_file;
+    s = env->NewWritableFile(tmp_name, &tmp_file, options);
+    if (s.ok()) {
+      s = tmp_file->Append(result);
+      if (s.ok()) {
+        s = env->RenameFile(tmp_name, filename);
+      } else {
+        fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(),
+                filename.c_str(), s.ToString().c_str());
+        env->DeleteFile(tmp_name);
+      }
+    }
+  }
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+  }
+
+  return s;
+}
+
+// Trim the trailing "/" at the end of `str`
+std::string TrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+std::pair<std::string, std::string> GetDirAndName(const std::string& name) {
+  std::string dirname = GetDirName(name);
+  std::string fname = name.substr(dirname.size() + 1);
+  return std::make_pair(dirname, fname);
+}
+
+Status FileState::DropUnsyncedData(Env* env) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  return Truncate(env, filename_, sync_pos);
+}
+
+Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  assert(pos_ >= sync_pos);
+  int range = static_cast<int>(pos_ - sync_pos);
+  uint64_t truncated_size =
+      static_cast<uint64_t>(sync_pos) + rand->Uniform(range);
+  return Truncate(env, filename_, truncated_size);
+}
+
+Status TestDirectory::Fsync() {
+  env_->SyncDir(dirname_);
+  return dir_->Fsync();
+}
+
+TestWritableFile::TestWritableFile(const std::string& fname,
+                                   std::unique_ptr<WritableFile>&& f,
+                                   FaultInjectionTestEnv* env)
+    : state_(fname),
+      target_(std::move(f)),
+      writable_file_opened_(true),
+      env_(env) {
+  assert(target_ != nullptr);
+  state_.pos_ = 0;
+}
+
+TestWritableFile::~TestWritableFile() {
+  if (writable_file_opened_) {
+    Close();
+  }
+}
+
+Status TestWritableFile::Append(const Slice& data) {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  Status s = target_->Append(data);
+  if (s.ok()) {
+    state_.pos_ += data.size();
+    env_->WritableFileAppended(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Close() {
+  writable_file_opened_ = false;
+  Status s = target_->Close();
+  if (s.ok()) {
+    env_->WritableFileClosed(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Flush() {
+  Status s = target_->Flush();
+  if (s.ok() && env_->IsFilesystemActive()) {
+    state_.pos_at_last_flush_ = state_.pos_;
+  }
+  return s;
+}
+
+Status TestWritableFile::Sync() {
+  if (!env_->IsFilesystemActive()) {
+    return Status::IOError("FaultInjectionTestEnv: not active");
+  }
+  // No need to actually sync.
+  state_.pos_at_last_sync_ = state_.pos_;
+  env_->WritableFileSynced(state_);
+  return Status::OK();
+}
+
+Status FaultInjectionTestEnv::NewDirectory(const std::string& name,
+                                           std::unique_ptr<Directory>* result) {
+  std::unique_ptr<Directory> r;
+  Status s = target()->NewDirectory(name, &r);
+  assert(s.ok());
+  if (!s.ok()) {
+    return s;
+  }
+  result->reset(new TestDirectory(this, TrimDirname(name), r.release()));
+  return Status::OK();
+}
+
+Status FaultInjectionTestEnv::NewWritableFile(
+    const std::string& fname, std::unique_ptr<WritableFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  // Do not allow overwriting files
+  Status s = target()->FileExists(fname);
+  if (s.ok()) {
+    return Status::Corruption("File already exists.");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+  s = target()->NewWritableFile(fname, result, soptions);
+  if (s.ok()) {
+    result->reset(new TestWritableFile(fname, std::move(*result), this));
+    // If the file is opened for writing again, it will be truncated, so
+    // forget our saved state.
+    UntrackFile(fname);
+    MutexLock l(&mutex_);
+    open_files_.insert(fname);
+    auto dir_and_name = GetDirAndName(fname);
+    auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+    list.insert(dir_and_name.second);
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::ReopenWritableFile(
+    const std::string& fname, std::unique_ptr<WritableFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  Status s = target()->ReopenWritableFile(fname, result, soptions);
+  if (s.ok()) {
+    result->reset(new TestWritableFile(fname, std::move(*result), this));
+    // If the file is opened for writing again, it will be truncated, so
+    // forget our saved state.
+ UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + +Status FaultInjectionTestEnv::NewRandomAccessFile( + const std::string& fname, std::unique_ptr<RandomAccessFile>* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + return target()->NewRandomAccessFile(fname, result, soptions); +} + +Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = EnvWrapper::DeleteFile(f); + if (!s.ok()) { + fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), + s.ToString().c_str()); + } + assert(s.ok()); + if (s.ok()) { + UntrackFile(f); + } + return s; +} + +Status FaultInjectionTestEnv::RenameFile(const std::string& s, + const std::string& t) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status ret = EnvWrapper::RenameFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; +} + +void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { + MutexLock l(&mutex_); + if (open_files_.find(state.filename_) != open_files_.end()) { + db_file_state_[state.filename_] = state; + open_files_.erase(state.filename_); + } +} + +void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { + MutexLock l(&mutex_); + if (open_files_.find(state.filename_) != open_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) { + MutexLock l(&mutex_); + if (open_files_.find(state.filename_) != open_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +// For every file that is not fully synced, make a call to `func` with +// FileState of the file as the parameter. 
+Status FaultInjectionTestEnv::DropFileData(
+    std::function<Status(Env*, FileState)> func) {
+  Status s;
+  MutexLock l(&mutex_);
+  for (std::map<std::string, FileState>::const_iterator it =
+           db_file_state_.begin();
+       s.ok() && it != db_file_state_.end(); ++it) {
+    const FileState& state = it->second;
+    if (!state.IsFullySynced()) {
+      s = func(target(), state);
+    }
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::DropUnsyncedFileData() {
+  return DropFileData([&](Env* env, const FileState& state) {
+    return state.DropUnsyncedData(env);
+  });
+}
+
+Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) {
+  return DropFileData([&](Env* env, const FileState& state) {
+    return state.DropRandomUnsyncedData(env, rnd);
+  });
+}
+
+Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
+  // Because DeleteFile accesses this container, make a copy to avoid deadlock
+  std::map<std::string, std::set<std::string>> map_copy;
+  {
+    MutexLock l(&mutex_);
+    map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+                    dir_to_new_files_since_last_sync_.end());
+  }
+
+  for (auto& pair : map_copy) {
+    for (std::string name : pair.second) {
+      Status s = DeleteFile(pair.first + "/" + name);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void FaultInjectionTestEnv::ResetState() {
+  MutexLock l(&mutex_);
+  db_file_state_.clear();
+  dir_to_new_files_since_last_sync_.clear();
+  SetFilesystemActiveNoLock(true);
+}
+
+void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
+  MutexLock l(&mutex_);
+  auto dir_and_name = GetDirAndName(f);
+  dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+      dir_and_name.second);
+  db_file_state_.erase(f);
+  open_files_.erase(f);
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/fault_injection_test_env.h b/src/rocksdb/util/fault_injection_test_env.h
new file mode 100644
index 00000000..7c5a080f
--- /dev/null
+++ b/src/rocksdb/util/fault_injection_test_env.h
@@ -0,0 +1,194 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
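+//
+// A typical sequence, sketched here (the write and verification steps are
+// placeholders):
+//
+//   FaultInjectionTestEnv env(Env::Default());
+//   // ... route file writes through `env`, sync some of them ...
+//   env.SetFilesystemActive(false);  // simulate a crash
+//   env.DropUnsyncedFileData();      // lose everything not covered by a sync
+//   env.ResetState();                // "reboot": filesystem is active again
+//   // ... reopen and verify that only synced data survived ...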
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/filename.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+class TestWritableFile;
+class FaultInjectionTestEnv;
+
+struct FileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+
+  explicit FileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) {}
+
+  FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  Status DropUnsyncedData(Env* env) const;
+
+  Status DropRandomUnsyncedData(Env* env, Random* rand) const;
+};
+
+// A wrapper around WritableFile that notifies FaultInjectionTestEnv whenever
+// the file is written to, sync'ed, or closed.
+class TestWritableFile : public WritableFile {
+ public:
+  explicit TestWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>&& f,
+                            FaultInjectionTestEnv* env);
+  virtual ~TestWritableFile();
+  virtual Status Append(const Slice& data) override;
+  virtual Status Truncate(uint64_t size) override {
+    return target_->Truncate(size);
+  }
+  virtual Status Close() override;
+  virtual Status Flush() override;
+  virtual Status Sync() override;
+  virtual bool IsSyncThreadSafe() const override { return true; }
+  virtual Status PositionedAppend(const Slice& data,
+                                  uint64_t offset) override {
+    return target_->PositionedAppend(data, offset);
+  }
+  virtual bool use_direct_io() const override {
+    return target_->use_direct_io();
+  }
+
+ private:
+  FileState state_;
+  std::unique_ptr<WritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestEnv* env_;
+};
+
+class TestDirectory : public Directory {
+ public:
+  explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname,
+                         Directory* dir)
+      : env_(env), dirname_(dirname), dir_(dir) {}
+  ~TestDirectory() {}
+
+  virtual Status Fsync() override;
+
+ private:
+  FaultInjectionTestEnv* env_;
+  std::string dirname_;
+  std::unique_ptr<Directory> dir_;
+};
+
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+  explicit FaultInjectionTestEnv(Env* base)
+      : EnvWrapper(base), filesystem_active_(true) {}
+  virtual ~FaultInjectionTestEnv() {}
+
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override;
+
+  Status NewWritableFile(const std::string& fname,
+                         std::unique_ptr<WritableFile>* result,
+                         const EnvOptions& soptions) override;
+
+  Status ReopenWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>* result,
+                            const EnvOptions& soptions) override;
+
+  Status NewRandomAccessFile(const std::string& fname,
+                             std::unique_ptr<RandomAccessFile>* result,
+                             const EnvOptions& soptions) override;
+
+  virtual Status DeleteFile(const std::string& f) override;
+
+  virtual Status RenameFile(const std::string& s,
+                            const std::string& t) override;
+
+  virtual Status GetFreeSpace(const std::string& path,
+                              uint64_t* disk_free) override {
+    if (!IsFilesystemActive() && error_ == Status::NoSpace()) {
+      *disk_free = 0;
+      return Status::OK();
+    } else {
+      return target()->GetFreeSpace(path, disk_free);
+    }
+  }
+
+  void WritableFileClosed(const FileState& state);
+
+  void WritableFileSynced(const FileState& state);
+
+  void WritableFileAppended(const FileState& state);
+
+  // For every file that is not fully synced, make a call to `func` with
+ // FileState of the file as the parameter. + Status DropFileData(std::function<Status(Env*, FileState)> func); + + Status DropUnsyncedFileData(); + + Status DropRandomUnsyncedFileData(Random* rnd); + + Status DeleteFilesCreatedAfterLastDirSync(); + + void ResetState(); + + void UntrackFile(const std::string& f); + + void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); + dir_to_new_files_since_last_sync_.erase(dirname); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. + bool IsFilesystemActive() { + MutexLock l(&mutex_); + return filesystem_active_; + } + void SetFilesystemActiveNoLock(bool active, + Status error = Status::Corruption("Not active")) { + filesystem_active_ = active; + if (!active) { + error_ = error; + } + } + void SetFilesystemActive(bool active, + Status error = Status::Corruption("Not active")) { + MutexLock l(&mutex_); + SetFilesystemActiveNoLock(active, error); + } + void AssertNoOpenFile() { assert(open_files_.empty()); } + Status GetError() { return error_; } + + private: + port::Mutex mutex_; + std::map<std::string, FileState> db_file_state_; + std::set<std::string> open_files_; + std::unordered_map<std::string, std::set<std::string>> + dir_to_new_files_since_last_sync_; + bool filesystem_active_; // Record flushes, syncs, writes + Status error_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/file_reader_writer.cc b/src/rocksdb/util/file_reader_writer.cc new file mode 100644 index 00000000..9a818cb0 --- /dev/null +++ b/src/rocksdb/util/file_reader_writer.cc @@ -0,0 +1,867 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
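+//
+// The direct-I/O paths below align every read to the file's required buffer
+// alignment. A worked example of the arithmetic, assuming alignment = 4096
+// and a logical read at (offset = 5000, n = 100):
+//
+//   aligned_offset = TruncateToPageBoundary(4096, 5000);          // 4096
+//   offset_advance = 5000 - aligned_offset;                       // 904
+//   read_size      = Roundup(5000 + 100, 4096) - aligned_offset;  // 4096
+//
+// The read is issued at aligned_offset for read_size bytes, and the caller's
+// n bytes are copied out of the aligned buffer starting at offset_advance.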
+ +#include "util/file_reader_writer.h" + +#include <algorithm> +#include <mutex> + +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "util/random.h" +#include "util/rate_limiter.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef NDEBUG +namespace { +bool IsFileSectorAligned(const size_t off, size_t sector_size) { + return off % sector_size == 0; +} +} +#endif + +Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + Status s; + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t offset = offset_.fetch_add(n); + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = TruncateToPageBoundary(alignment, offset); + size_t offset_advance = offset - aligned_offset; + size_t size = Roundup(offset + n, alignment) - aligned_offset; + size_t r = 0; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(size); + Slice tmp; + s = file_->PositionedRead(aligned_offset, size, &tmp, buf.BufferStart()); + if (s.ok() && offset_advance < tmp.size()) { + buf.Size(tmp.size()); + r = buf.Read(scratch, offset_advance, + std::min(tmp.size() - offset_advance, n)); + } + *result = Slice(scratch, r); +#endif // !ROCKSDB_LITE + } else { + s = file_->Read(n, result, scratch); + } + IOSTATS_ADD(bytes_read, result->size()); + return s; +} + + +Status SequentialFileReader::Skip(uint64_t n) { +#ifndef ROCKSDB_LITE + if (use_direct_io()) { + offset_ += static_cast<size_t>(n); + return Status::OK(); + } +#endif // !ROCKSDB_LITE + return file_->Skip(n); +} + +Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + uint64_t elapsed = 0; + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = TruncateToPageBoundary(alignment, static_cast<size_t>(offset)); + size_t offset_advance = static_cast<size_t>(offset) - aligned_offset; + size_t read_size = Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(read_size); + while (buf.CurrentSize() < read_size) { + size_t allowed; + if (for_compaction_ && rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + buf.Capacity() - buf.CurrentSize(), buf.Alignment(), + Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead); + } else { + assert(buf.CurrentSize() == 0); + allowed = read_size; + } + Slice tmp; + + FileOperationInfo::TimePoint start_ts; + uint64_t orig_offset = 0; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + orig_offset = aligned_offset + buf.CurrentSize(); + } + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp, + buf.Destination()); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + s); + } + + buf.Size(buf.CurrentSize() + tmp.size()); + if (!s.ok() || tmp.size() < allowed) { + break; + } + } + size_t res_len = 0; + if (s.ok() && offset_advance < buf.CurrentSize()) { + res_len = buf.Read(scratch, offset_advance, + std::min(buf.CurrentSize() - offset_advance, n)); + } + *result = Slice(scratch, res_len); +#endif // !ROCKSDB_LITE + } else { + size_t pos = 0; + const char* res_scratch = nullptr; + while (pos < n) { + size_t allowed; + if (for_compaction_ && rate_limiter_ != nullptr) { + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStart(); + } + allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */, + Env::IOPriority::IO_LOW, stats_, + RateLimiter::OpType::kRead); + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStop(); + } + } else { + allowed = n; + } + Slice tmp_result; + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, + finish_ts, s); + } +#endif + + if (res_scratch == nullptr) { + // we can't simply use `scratch` because reads of mmap'd files return + // data in a different buffer. + res_scratch = tmp_result.data(); + } else { + // make sure chunks are inserted contiguously into `res_scratch`. + assert(tmp_result.data() == res_scratch + pos); + } + pos += tmp_result.size(); + if (!s.ok() || tmp_result.size() < allowed) { + break; + } + } + *result = Slice(res_scratch, s.ok() ? 
pos : 0);
+    }
+    IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return s;
+}
+
+Status WritableFileWriter::Append(const Slice& data) {
+  const char* src = data.data();
+  size_t left = data.size();
+  Status s;
+  pending_sync_ = true;
+
+  TEST_KILL_RANDOM("WritableFileWriter::Append:0",
+                   rocksdb_kill_odds * REDUCE_ODDS2);
+
+  {
+    IOSTATS_TIMER_GUARD(prepare_write_nanos);
+    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
+    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
+  }
+
+  // See whether we need to enlarge the buffer to avoid the flush
+  if (buf_.Capacity() - buf_.CurrentSize() < left) {
+    for (size_t cap = buf_.Capacity();
+         cap < max_buffer_size_;  // There is still room to increase
+         cap *= 2) {
+      // See whether the next available size is large enough.
+      // Buffer will never be increased to more than max_buffer_size_.
+      size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
+      if (desired_capacity - buf_.CurrentSize() >= left ||
+          (use_direct_io() && desired_capacity == max_buffer_size_)) {
+        buf_.AllocateNewBuffer(desired_capacity, true);
+        break;
+      }
+    }
+  }
+
+  // Flush only when using buffered I/O
+  if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
+    if (buf_.CurrentSize() > 0) {
+      s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    assert(buf_.CurrentSize() == 0);
+  }
+
+  // We never write directly to disk when direct I/O is on; otherwise the
+  // buffer simply serves its original purpose of accumulating many small
+  // chunks.
+  if (use_direct_io() || (buf_.Capacity() >= left)) {
+    while (left > 0) {
+      size_t appended = buf_.Append(src, left);
+      left -= appended;
+      src += appended;
+
+      if (left > 0) {
+        s = Flush();
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+  } else {
+    // Writing directly to the file, bypassing the buffer
+    assert(buf_.CurrentSize() == 0);
+    s = WriteBuffered(src, left);
+  }
+
+  TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
+  if (s.ok()) {
+    filesize_ += data.size();
+  }
+  return s;
+}
+
+Status WritableFileWriter::Pad(const size_t pad_bytes) {
+  assert(pad_bytes < kDefaultPageSize);
+  size_t left = pad_bytes;
+  size_t cap = buf_.Capacity() - buf_.CurrentSize();
+
+  // Assume pad_bytes is small compared to buf_ capacity. So we always
+  // use buf_ rather than writing directly to the file as Append() does in
+  // certain cases.
+  while (left) {
+    size_t append_bytes = std::min(cap, left);
+    buf_.PadWith(append_bytes, 0);
+    left -= append_bytes;
+    if (left > 0) {
+      Status s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    cap = buf_.Capacity() - buf_.CurrentSize();
+  }
+  pending_sync_ = true;
+  filesize_ += pad_bytes;
+  return Status::OK();
+}
+
+Status WritableFileWriter::Close() {
+  // Do not quit immediately on failure; the file MUST be closed.
+  Status s;
+
+  // It is now possible to close the file twice, since we MUST close it in
+  // the destructor and simply flushing is not enough: on Windows
+  // pre-allocation does not fill the file with zeros, and with unbuffered
+  // access we also need to set the end of data.
+  if (!writable_file_) {
+    return s;
+  }
+
+  s = Flush();  // flush cache to OS
+
+  Status interim;
+  // In direct I/O mode we write whole pages, so
+  // we need to let the file know where data ends.
+ if (use_direct_io()) { + interim = writable_file_->Truncate(filesize_); + if (interim.ok()) { + interim = writable_file_->Fsync(); + } + if (!interim.ok() && s.ok()) { + s = interim; + } + } + + TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); + interim = writable_file_->Close(); + if (!interim.ok() && s.ok()) { + s = interim; + } + + writable_file_.reset(); + TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + + return s; +} + +// write out the cached data to the OS cache or storage if direct I/O +// enabled +Status WritableFileWriter::Flush() { + Status s; + TEST_KILL_RANDOM("WritableFileWriter::Flush:0", + rocksdb_kill_odds * REDUCE_ODDS2); + + if (buf_.CurrentSize() > 0) { + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + if (pending_sync_) { + s = WriteDirect(); + } +#endif // !ROCKSDB_LITE + } else { + s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + } + if (!s.ok()) { + return s; + } + } + + s = writable_file_->Flush(); + + if (!s.ok()) { + return s; + } + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). + + // We try to avoid sync to the last 1MB of data. For two reasons: + // (1) avoid rewrite the same page that is modified later. + // (2) for older version of OS, write can block while writing out + // the page. + // Xfs does neighbor page flushing outside of the specified ranges. We + // need to make sure sync range is far from the write offset. + if (!use_direct_io() && bytes_per_sync_) { + const uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced. + const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB. 
+ if (filesize_ > kBytesNotSyncRange) { + uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange; + offset_sync_to -= offset_sync_to % kBytesAlignWhenSync; + assert(offset_sync_to >= last_sync_size_); + if (offset_sync_to > 0 && + offset_sync_to - last_sync_size_ >= bytes_per_sync_) { + s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + last_sync_size_ = offset_sync_to; + } + } + } + + return s; +} + +Status WritableFileWriter::Sync(bool use_fsync) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + if (!use_direct_io() && pending_sync_) { + s = SyncInternal(use_fsync); + if (!s.ok()) { + return s; + } + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + pending_sync_ = false; + return Status::OK(); +} + +Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) { + if (!writable_file_->IsSyncThreadSafe()) { + return Status::NotSupported( + "Can't WritableFileWriter::SyncWithoutFlush() because " + "WritableFile::IsSyncThreadSafe() is false"); + } + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); + Status s = SyncInternal(use_fsync); + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); + return s; +} + +Status WritableFileWriter::SyncInternal(bool use_fsync) { + Status s; + IOSTATS_TIMER_GUARD(fsync_nanos); + TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + if (use_fsync) { + s = writable_file_->Fsync(); + } else { + s = writable_file_->Sync(); + } + SetPerfLevel(prev_perf_level); + return s; +} + +Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { + IOSTATS_TIMER_GUARD(range_sync_nanos); + TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); + return writable_file_->RangeSync(offset, nbytes); +} + +// This method writes to disk the specified data and makes use of the rate +// limiter if available +Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { + Status s; + assert(!use_direct_io()); + const char* src = data; + size_t left = size; + + while (left > 0) { + size_t allowed; + if (rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_, + RateLimiter::OpType::kWrite); + } else { + allowed = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(); + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + old_size = next_write_offset_; + } +#endif + { + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + s = writable_file_->Append(Slice(src, allowed)); + SetPerfLevel(prev_perf_level); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + } +#endif + if (!s.ok()) { + return s; + } + } + + IOSTATS_ADD(bytes_written, allowed); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + + left -= allowed; + src += allowed; + } + buf_.Size(0); + return s; +} + + +// This flushes the accumulated data in the buffer. We pad data with zeros if +// necessary to the whole page. +// However, during automatic flushes padding would not be necessary. 
+// We always use RateLimiter if available. We move (Refit) any buffer bytes +// that are left over the +// whole number of pages to be written again on the next flush because we can +// only write on aligned +// offsets. +#ifndef ROCKSDB_LITE +Status WritableFileWriter::WriteDirect() { + assert(use_direct_io()); + Status s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write + // it again in the future either on Close() OR when the current whole page + // fills out + size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up and pad + buf_.PadToAlignmentWith(0); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + + while (left > 0) { + // Check how much is allowed + size_t size; + if (rate_limiter_ != nullptr) { + size = rate_limiter_->RequestToken(left, buf_.Alignment(), + writable_file_->GetIOPriority(), + stats_, RateLimiter::OpType::kWrite); + } else { + size = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } + // direct writes must be positional + s = writable_file_->PositionedAppend(Slice(src, size), write_offset); + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + } + if (!s.ok()) { + buf_.Size(file_advance + leftover_tail); + return s; + } + } + + IOSTATS_ADD(bytes_written, size); + left -= size; + src += size; + write_offset += size; + assert((next_write_offset_ % alignment) == 0); + } + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close() + buf_.RefitTail(file_advance, leftover_tail); + // This is where we start writing next time which may or not be + // the actual file size on disk. 
They match if the buffer size + // is a multiple of whole pages otherwise filesize_ is leftover_tail + // behind + next_write_offset_ += file_advance; + } + return s; +} +#endif // !ROCKSDB_LITE + +namespace { +class ReadaheadRandomAccessFile : public RandomAccessFile { + public: + ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile>&& file, + size_t readahead_size) + : file_(std::move(file)), + alignment_(file_->GetRequiredBufferAlignment()), + readahead_size_(Roundup(readahead_size, alignment_)), + buffer_(), + buffer_offset_(0) { + buffer_.Alignment(alignment_); + buffer_.AllocateNewBuffer(readahead_size_); + } + + ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete; + + ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete; + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + if (n + alignment_ >= readahead_size_) { + return file_->Read(offset, n, result, scratch); + } + + std::unique_lock<std::mutex> lk(lock_); + + size_t cached_len = 0; + // Check if there is a cache hit, means that [offset, offset + n) is either + // completely or partially in the buffer + // If it's completely cached, including end of file case when offset + n is + // greater than EOF, return + if (TryReadFromCache(offset, n, &cached_len, scratch) && + (cached_len == n || + // End of file + buffer_.CurrentSize() < readahead_size_)) { + *result = Slice(scratch, cached_len); + return Status::OK(); + } + size_t advanced_offset = static_cast<size_t>(offset + cached_len); + // In the case of cache hit advanced_offset is already aligned, means that + // chunk_offset equals to advanced_offset + size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); + Slice readahead_result; + + Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + if (s.ok()) { + // In the case of cache miss, i.e. when cached_len equals 0, an offset can + // exceed the file end position, so the following check is required + if (advanced_offset < chunk_offset + buffer_.CurrentSize()) { + // In the case of cache miss, the first chunk_padding bytes in buffer_ + // are + // stored for alignment only and must be skipped + size_t chunk_padding = advanced_offset - chunk_offset; + auto remaining_len = + std::min(buffer_.CurrentSize() - chunk_padding, n - cached_len); + memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding, + remaining_len); + *result = Slice(scratch, cached_len + remaining_len); + } else { + *result = Slice(scratch, cached_len); + } + } + return s; + } + + Status Prefetch(uint64_t offset, size_t n) override { + if (n < readahead_size_) { + // Don't allow smaller prefetches than the configured `readahead_size_`. + // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. 
+ return Status::OK(); + } + size_t offset_ = static_cast<size_t>(offset); + size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); + if (prefetch_offset == buffer_offset_) { + return Status::OK(); + } + return ReadIntoBuffer(prefetch_offset, + Roundup(offset_ + n, alignment_) - prefetch_offset); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return file_->GetUniqueId(id, max_size); + } + + void Hint(AccessPattern pattern) override { file_->Hint(pattern); } + + Status InvalidateCache(size_t offset, size_t length) override { + return file_->InvalidateCache(offset, length); + } + + bool use_direct_io() const override { return file_->use_direct_io(); } + +private: + bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len, + char* scratch) const { + if (offset < buffer_offset_ || + offset >= buffer_offset_ + buffer_.CurrentSize()) { + *cached_len = 0; + return false; + } + uint64_t offset_in_buffer = offset - buffer_offset_; + *cached_len = std::min( + buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n); + memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len); + return true; + } + + Status ReadIntoBuffer(uint64_t offset, size_t n) const { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + assert(IsFileSectorAligned(offset, alignment_)); + assert(IsFileSectorAligned(n, alignment_)); + Slice result; + Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = offset; + buffer_.Size(result.size()); + assert(buffer_.BufferStart() == result.data()); + } + return s; + } + + std::unique_ptr<RandomAccessFile> file_; + const size_t alignment_; + size_t readahead_size_; + + mutable std::mutex lock_; + mutable AlignedBuffer buffer_; + mutable uint64_t buffer_offset_; +}; +} // namespace + +Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, + uint64_t offset, size_t n) { + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + size_t offset_ = static_cast<size_t>(offset); + uint64_t rounddown_offset = Rounddown(offset_, alignment); + uint64_t roundup_end = Roundup(offset_ + n, alignment); + uint64_t roundup_len = roundup_end - rounddown_offset; + assert(roundup_len >= alignment); + assert(roundup_len % alignment == 0); + + // Check if requested bytes are in the existing buffer_. + // If all bytes exist -- return. + // If only a few bytes exist -- reuse them & read only what is really needed. + // This is typically the case of incremental reading of data. + // If no bytes exist in buffer -- full pread. + + Status s; + uint64_t chunk_offset_in_buffer = 0; + uint64_t chunk_len = 0; + bool copy_data_to_new_buffer = false; + if (buffer_.CurrentSize() > 0 && offset >= buffer_offset_ && + offset <= buffer_offset_ + buffer_.CurrentSize()) { + if (offset + n <= buffer_offset_ + buffer_.CurrentSize()) { + // All requested bytes are already in the buffer. So no need to Read + // again. + return s; + } else { + // Only a few requested bytes are in the buffer. memmove those chunk of + // bytes to the beginning, and memcpy them back into the new buffer if a + // new buffer is created. 
+ chunk_offset_in_buffer = Rounddown(static_cast<size_t>(offset - buffer_offset_), alignment); + chunk_len = buffer_.CurrentSize() - chunk_offset_in_buffer; + assert(chunk_offset_in_buffer % alignment == 0); + assert(chunk_len % alignment == 0); + assert(chunk_offset_in_buffer + chunk_len <= + buffer_offset_ + buffer_.CurrentSize()); + if (chunk_len > 0) { + copy_data_to_new_buffer = true; + } else { + // this reset is not necessary, but just to be safe. + chunk_offset_in_buffer = 0; + } + } + } + + // Create a new buffer only if current capacity is not sufficient, and memcopy + // bytes from old buffer if needed (i.e., if chunk_len is greater than 0). + if (buffer_.Capacity() < roundup_len) { + buffer_.Alignment(alignment); + buffer_.AllocateNewBuffer(static_cast<size_t>(roundup_len), + copy_data_to_new_buffer, chunk_offset_in_buffer, + static_cast<size_t>(chunk_len)); + } else if (chunk_len > 0) { + // New buffer not needed. But memmove bytes from tail to the beginning since + // chunk_len is greater than 0. + buffer_.RefitTail(static_cast<size_t>(chunk_offset_in_buffer), static_cast<size_t>(chunk_len)); + } + + Slice result; + s = reader->Read(rounddown_offset + chunk_len, + static_cast<size_t>(roundup_len - chunk_len), &result, + buffer_.BufferStart() + chunk_len); + if (s.ok()) { + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast<size_t>(chunk_len) + result.size()); + } + return s; +} + +bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, + Slice* result) { + if (track_min_offset_ && offset < min_offset_read_) { + min_offset_read_ = static_cast<size_t>(offset); + } + if (!enable_ || offset < buffer_offset_) { + return false; + } + + // If the buffer contains only a few of the requested bytes: + // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // and satisfy the request. + // If readahead is not enabled: return false. + if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { + if (readahead_size_ > 0) { + assert(file_reader_ != nullptr); + assert(max_readahead_size_ >= readahead_size_); + + Status s = Prefetch(file_reader_, offset, n + readahead_size_); + if (!s.ok()) { + return false; + } + readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); + } else { + return false; + } + } + + uint64_t offset_in_buffer = offset - buffer_offset_; + *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); + return true; +} + +std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile( + std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size) { + std::unique_ptr<RandomAccessFile> result( + new ReadaheadRandomAccessFile(std::move(file), readahead_size)); + return result; +} + +Status NewWritableFile(Env* env, const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) { + Status s = env->NewWritableFile(fname, result, options); + TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + return s; +} + +bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, + std::string* output, bool* has_data, Status* result) { + const int kBufferSize = 8192; + char buffer[kBufferSize + 1]; + Slice input_slice; + + std::string line; + bool has_complete_line = false; + while (!has_complete_line) { + if (std::getline(*iss, line)) { + has_complete_line = !iss->eof(); + } else { + has_complete_line = false; + } + if (!has_complete_line) { + // if we're not sure whether we have a complete line, + // further read from the file. 
+ if (*has_data) { + *result = seq_file->Read(kBufferSize, &input_slice, buffer); + } + if (input_slice.size() == 0) { + // meaning we have read all the data + *has_data = false; + break; + } else { + iss->str(line + input_slice.ToString()); + // reset the internal state of iss so that we can keep reading it. + iss->clear(); + *has_data = (input_slice.size() == kBufferSize); + continue; + } + } + } + *output = line; + return *has_data || has_complete_line; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/file_reader_writer.h b/src/rocksdb/util/file_reader_writer.h new file mode 100644 index 00000000..4451f8b8 --- /dev/null +++ b/src/rocksdb/util/file_reader_writer.h @@ -0,0 +1,326 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include <atomic> +#include <sstream> +#include <string> +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" +#include "util/aligned_buffer.h" +#include "util/sync_point.h" + +namespace rocksdb { + +class Statistics; +class HistogramImpl; + +std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile( + std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size); + +class SequentialFileReader { + private: + std::unique_ptr<SequentialFile> file_; + std::string file_name_; + std::atomic<size_t> offset_; // read offset + + public: + explicit SequentialFileReader(std::unique_ptr<SequentialFile>&& _file, + const std::string& _file_name) + : file_(std::move(_file)), file_name_(_file_name), offset_(0) {} + + SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + return *this; + } + + SequentialFileReader(const SequentialFileReader&) = delete; + SequentialFileReader& operator=(const SequentialFileReader&) = delete; + + Status Read(size_t n, Slice* result, char* scratch); + + Status Skip(uint64_t n); + + void Rewind(); + + SequentialFile* file() { return file_.get(); } + + std::string file_name() { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } +}; + +class RandomAccessFileReader { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileReadFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr<RandomAccessFile> file_; + std::string file_name_; + Env* env_; + Statistics* stats_; + uint32_t hist_type_; + HistogramImpl* file_read_hist_; + RateLimiter* rate_limiter_; + bool for_compaction_; + std::vector<std::shared_ptr<EventListener>> listeners_; + + public: + explicit RandomAccessFileReader( + std::unique_ptr<RandomAccessFile>&& 
raf, std::string _file_name, + Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, + HistogramImpl* file_read_hist = nullptr, + RateLimiter* rate_limiter = nullptr, bool for_compaction = false, + const std::vector<std::shared_ptr<EventListener>>& listeners = {}) + : file_(std::move(raf)), + file_name_(std::move(_file_name)), + env_(env), + stats_(stats), + hist_type_(hist_type), + file_read_hist_(file_read_hist), + rate_limiter_(rate_limiter), + for_compaction_(for_compaction), + listeners_() { +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr<EventListener>& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + RandomAccessFileReader& operator=(RandomAccessFileReader&& o) + ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + env_ = std::move(o.env_); + stats_ = std::move(o.stats_); + hist_type_ = std::move(o.hist_type_); + file_read_hist_ = std::move(o.file_read_hist_); + rate_limiter_ = std::move(o.rate_limiter_); + for_compaction_ = std::move(o.for_compaction_); + return *this; + } + + RandomAccessFileReader(const RandomAccessFileReader&) = delete; + RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; + + Status Prefetch(uint64_t offset, size_t n) const { + return file_->Prefetch(offset, n); + } + + RandomAccessFile* file() { return file_.get(); } + + std::string file_name() const { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } +}; + +// Use posix write to write data to a file. 
+class WritableFileWriter { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileWriteFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileWriteFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr<WritableFile> writable_file_; + std::string file_name_; + Env* env_; + AlignedBuffer buf_; + size_t max_buffer_size_; + // Actually written data size can be used for truncate + // not counting padding data + uint64_t filesize_; +#ifndef ROCKSDB_LITE + // This is necessary when we use unbuffered access + // and writes must happen on aligned offsets + // so we need to go back and write that page again + uint64_t next_write_offset_; +#endif // ROCKSDB_LITE + bool pending_sync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; + RateLimiter* rate_limiter_; + Statistics* stats_; + std::vector<std::shared_ptr<EventListener>> listeners_; + + public: + WritableFileWriter( + std::unique_ptr<WritableFile>&& file, const std::string& _file_name, + const EnvOptions& options, Env* env = nullptr, + Statistics* stats = nullptr, + const std::vector<std::shared_ptr<EventListener>>& listeners = {}) + : writable_file_(std::move(file)), + file_name_(_file_name), + env_(env), + buf_(), + max_buffer_size_(options.writable_file_max_buffer_size), + filesize_(0), +#ifndef ROCKSDB_LITE + next_write_offset_(0), +#endif // ROCKSDB_LITE + pending_sync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + rate_limiter_(options.rate_limiter), + stats_(stats), + listeners_() { + TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", + reinterpret_cast<void*>(max_buffer_size_)); + buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); + buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_)); +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr<EventListener>& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + WritableFileWriter(const WritableFileWriter&) = delete; + + WritableFileWriter& operator=(const WritableFileWriter&) = delete; + + ~WritableFileWriter() { Close(); } + + std::string file_name() const { return file_name_; } + + Status Append(const Slice& data); + + Status Pad(const size_t pad_bytes); + + Status Flush(); + + Status Close(); + + Status Sync(bool use_fsync); + + // Sync only the data that was already Flush()ed. Safe to call concurrently + // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), + // returns NotSupported status. 
+  Status SyncWithoutFlush(bool use_fsync);
+
+  uint64_t GetFileSize() { return filesize_; }
+
+  Status InvalidateCache(size_t offset, size_t length) {
+    return writable_file_->InvalidateCache(offset, length);
+  }
+
+  WritableFile* writable_file() const { return writable_file_.get(); }
+
+  bool use_direct_io() { return writable_file_->use_direct_io(); }
+
+  bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; }
+
+ private:
+  // Used when OS buffering is off and writes go through DMA,
+  // such as in Direct I/O mode
+#ifndef ROCKSDB_LITE
+  Status WriteDirect();
+#endif  // !ROCKSDB_LITE
+  // Normal write
+  Status WriteBuffered(const char* data, size_t size);
+  Status RangeSync(uint64_t offset, uint64_t nbytes);
+  Status SyncInternal(bool use_fsync);
+};
+
+// FilePrefetchBuffer can automatically do the readahead if file_reader,
+// readahead_size, and max_readahead_size are passed in.
+// max_readahead_size should be greater than or equal to readahead_size.
+// readahead_size will be doubled on every IO, until max_readahead_size.
+class FilePrefetchBuffer {
+ public:
+  // If `track_min_offset` is true, track minimum offset ever read.
+  FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr,
+                     size_t readahead_size = 0, size_t max_readahead_size = 0,
+                     bool enable = true, bool track_min_offset = false)
+      : buffer_offset_(0),
+        file_reader_(file_reader),
+        readahead_size_(readahead_size),
+        max_readahead_size_(max_readahead_size),
+        min_offset_read_(port::kMaxSizet),
+        enable_(enable),
+        track_min_offset_(track_min_offset) {}
+  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n);
+  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result);
+
+  // The minimum `offset` ever passed to TryReadFromCache(). Only tracked
+  // if track_min_offset = true.
+  size_t min_offset_read() const { return min_offset_read_; }
+
+ private:
+  AlignedBuffer buffer_;
+  uint64_t buffer_offset_;
+  RandomAccessFileReader* file_reader_;
+  size_t readahead_size_;
+  size_t max_readahead_size_;
+  // The minimum `offset` ever passed to TryReadFromCache().
+  size_t min_offset_read_;
+  // If false, TryReadFromCache() always returns false, and we only track the
+  // minimum offset if track_min_offset_ = true.
+  bool enable_;
+  // If true, track minimum `offset` ever passed to TryReadFromCache(), which
+  // can be fetched from min_offset_read().
+  bool track_min_offset_;
+};
+
+extern Status NewWritableFile(Env* env, const std::string& fname,
+                              std::unique_ptr<WritableFile>* result,
+                              const EnvOptions& options);
+bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file,
+                 std::string* output, bool* has_data, Status* result);
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/file_reader_writer_test.cc b/src/rocksdb/util/file_reader_writer_test.cc
new file mode 100644
index 00000000..6a7ea6d7
--- /dev/null
+++ b/src/rocksdb/util/file_reader_writer_test.cc
@@ -0,0 +1,330 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// +#include "util/file_reader_writer.h" +#include <algorithm> +#include <vector> +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class WritableFileWriterTest : public testing::Test {}; + +const uint32_t kMb = 1 << 20; + +TEST_F(WritableFileWriterTest, RangeSync) { + class FakeWF : public WritableFile { + public: + explicit FakeWF() : size_(0), last_synced_(0) {} + ~FakeWF() override {} + + Status Append(const Slice& data) override { + size_ += data.size(); + return Status::OK(); + } + Status Truncate(uint64_t /*size*/) override { return Status::OK(); } + Status Close() override { + EXPECT_GE(size_, last_synced_ + kMb); + EXPECT_LT(size_, last_synced_ + 2 * kMb); + // Make sure random writes generated enough writes. + EXPECT_GT(size_, 10 * kMb); + return Status::OK(); + } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Fsync() override { return Status::OK(); } + void SetIOPriority(Env::IOPriority /*pri*/) override {} + uint64_t GetFileSize() override { return size_; } + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + return Status::OK(); + } + + protected: + Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { + return Status::OK(); + } + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + EXPECT_EQ(offset % 4096, 0u); + EXPECT_EQ(nbytes % 4096, 0u); + + EXPECT_EQ(offset, last_synced_); + last_synced_ = offset + nbytes; + EXPECT_GE(size_, last_synced_ + kMb); + if (size_ > 2 * kMb) { + EXPECT_LT(size_, last_synced_ + 2 * kMb); + } + return Status::OK(); + } + + uint64_t size_; + uint64_t last_synced_; + }; + + EnvOptions env_options; + env_options.bytes_per_sync = kMb; + std::unique_ptr<FakeWF> wf(new FakeWF); + std::unique_ptr<WritableFileWriter> writer( + new WritableFileWriter(std::move(wf), "" /* don't care */, env_options)); + Random r(301); + std::unique_ptr<char[]> large_buf(new char[10 * kMb]); + for (int i = 0; i < 1000; i++) { + int skew_limit = (i < 700) ? 10 : 15; + uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); + writer->Append(Slice(large_buf.get(), num)); + + // Flush in a chance of 1/10. 
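+    // (Editorial note: r.Uniform(10) yields a value in [0, 10), so the
+    // branch below is taken with probability ~1/10.)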
+ if (r.Uniform(10) == 0) { + writer->Flush(); + } + } + writer->Close(); +} + +TEST_F(WritableFileWriterTest, IncrementalBuffer) { + class FakeWF : public WritableFile { + public: + explicit FakeWF(std::string* _file_data, bool _use_direct_io, + bool _no_flush) + : file_data_(_file_data), + use_direct_io_(_use_direct_io), + no_flush_(_no_flush) {} + ~FakeWF() override {} + + Status Append(const Slice& data) override { + file_data_->append(data.data(), data.size()); + size_ += data.size(); + return Status::OK(); + } + Status PositionedAppend(const Slice& data, uint64_t pos) override { + EXPECT_TRUE(pos % 512 == 0); + EXPECT_TRUE(data.size() % 512 == 0); + file_data_->resize(pos); + file_data_->append(data.data(), data.size()); + size_ += data.size(); + return Status::OK(); + } + + Status Truncate(uint64_t size) override { + file_data_->resize(size); + return Status::OK(); + } + Status Close() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Fsync() override { return Status::OK(); } + void SetIOPriority(Env::IOPriority /*pri*/) override {} + uint64_t GetFileSize() override { return size_; } + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + return Status::OK(); + } + bool use_direct_io() const override { return use_direct_io_; } + + std::string* file_data_; + bool use_direct_io_; + bool no_flush_; + size_t size_ = 0; + }; + + Random r(301); + const int kNumAttempts = 50; + for (int attempt = 0; attempt < kNumAttempts; attempt++) { + bool no_flush = (attempt % 3 == 0); + EnvOptions env_options; + env_options.writable_file_max_buffer_size = + (attempt < kNumAttempts / 2) ? 512 * 1024 : 700 * 1024; + std::string actual; + std::unique_ptr<FakeWF> wf(new FakeWF(&actual, +#ifndef ROCKSDB_LITE + attempt % 2 == 1, +#else + false, +#endif + no_flush)); + std::unique_ptr<WritableFileWriter> writer(new WritableFileWriter( + std::move(wf), "" /* don't care */, env_options)); + + std::string target; + for (int i = 0; i < 20; i++) { + uint32_t num = r.Skewed(16) * 100 + r.Uniform(100); + std::string random_string; + test::RandomString(&r, num, &random_string); + writer->Append(Slice(random_string.c_str(), num)); + target.append(random_string.c_str(), num); + + // In some attempts, flush in a chance of 1/10. 
+ if (!no_flush && r.Uniform(10) == 0) { + writer->Flush(); + } + } + writer->Flush(); + writer->Close(); + ASSERT_EQ(target.size(), actual.size()); + ASSERT_EQ(target, actual); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(WritableFileWriterTest, AppendStatusReturn) { + class FakeWF : public WritableFile { + public: + explicit FakeWF() : use_direct_io_(false), io_error_(false) {} + + bool use_direct_io() const override { return use_direct_io_; } + Status Append(const Slice& /*data*/) override { + if (io_error_) { + return Status::IOError("Fake IO error"); + } + return Status::OK(); + } + Status PositionedAppend(const Slice& /*data*/, uint64_t) override { + if (io_error_) { + return Status::IOError("Fake IO error"); + } + return Status::OK(); + } + Status Close() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + void Setuse_direct_io(bool val) { use_direct_io_ = val; } + void SetIOError(bool val) { io_error_ = val; } + + protected: + bool use_direct_io_; + bool io_error_; + }; + std::unique_ptr<FakeWF> wf(new FakeWF()); + wf->Setuse_direct_io(true); + std::unique_ptr<WritableFileWriter> writer( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); + + ASSERT_OK(writer->Append(std::string(2 * kMb, 'a'))); + + // Next call to WritableFile::Append() should fail + dynamic_cast<FakeWF*>(writer->writable_file())->SetIOError(true); + ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b'))); +} +#endif + +class ReadaheadRandomAccessFileTest + : public testing::Test, + public testing::WithParamInterface<size_t> { + public: + static std::vector<size_t> GetReadaheadSizeList() { + return {1lu << 12, 1lu << 16}; + } + void SetUp() override { + readahead_size_ = GetParam(); + scratch_.reset(new char[2 * readahead_size_]); + ResetSourceStr(); + } + ReadaheadRandomAccessFileTest() : control_contents_() {} + std::string Read(uint64_t offset, size_t n) { + Slice result; + test_read_holder_->Read(offset, n, &result, scratch_.get()); + return std::string(result.data(), result.size()); + } + void ResetSourceStr(const std::string& str = "") { + auto write_holder = + std::unique_ptr<WritableFileWriter>(test::GetWritableFileWriter( + new test::StringSink(&control_contents_), "" /* don't care */)); + write_holder->Append(Slice(str)); + write_holder->Flush(); + auto read_holder = std::unique_ptr<RandomAccessFile>( + new test::StringSource(control_contents_)); + test_read_holder_ = + NewReadaheadRandomAccessFile(std::move(read_holder), readahead_size_); + } + size_t GetReadaheadSize() const { return readahead_size_; } + + private: + size_t readahead_size_; + Slice control_contents_; + std::unique_ptr<RandomAccessFile> test_read_holder_; + std::unique_ptr<char[]> scratch_; +}; + +TEST_P(ReadaheadRandomAccessFileTest, EmptySourceStrTest) { + ASSERT_EQ("", Read(0, 1)); + ASSERT_EQ("", Read(0, 0)); + ASSERT_EQ("", Read(13, 13)); +} + +TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenLessThanReadaheadSizeTest) { + std::string str = "abcdefghijklmnopqrs"; + ResetSourceStr(str); + ASSERT_EQ(str.substr(3, 4), Read(3, 4)); + ASSERT_EQ(str.substr(0, 3), Read(0, 3)); + ASSERT_EQ(str, Read(0, str.size())); + ASSERT_EQ(str.substr(7, std::min(static_cast<int>(str.size()) - 7, 30)), + Read(7, 30)); + ASSERT_EQ("", Read(100, 100)); +} + +TEST_P(ReadaheadRandomAccessFileTest, + SourceStrLenCanBeGreaterThanReadaheadSizeTest) { + Random rng(42); + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + 
rng.Uniform(static_cast<int>(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast<int>(strLen)); + ResetSourceStr(str); + for (int test = 1; test <= 100; ++test) { + size_t offset = rng.Uniform(static_cast<int>(strLen)); + size_t n = rng.Uniform(static_cast<int>(GetReadaheadSize())); + ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + Read(offset, n)); + } + } +} + +TEST_P(ReadaheadRandomAccessFileTest, NExceedReadaheadTest) { + Random rng(7); + size_t strLen = 4 * GetReadaheadSize() + + rng.Uniform(static_cast<int>(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast<int>(strLen)); + ResetSourceStr(str); + for (int test = 1; test <= 100; ++test) { + size_t offset = rng.Uniform(static_cast<int>(strLen)); + size_t n = + GetReadaheadSize() + rng.Uniform(static_cast<int>(GetReadaheadSize())); + ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + Read(offset, n)); + } +} + +INSTANTIATE_TEST_CASE_P( + EmptySourceStrTest, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenLessThanReadaheadSizeTest, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenCanBeGreaterThanReadaheadSizeTest, + ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + NExceedReadaheadTest, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/file_util.cc b/src/rocksdb/util/file_util.cc new file mode 100644 index 00000000..ba1b4744 --- /dev/null +++ b/src/rocksdb/util/file_util.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#include "util/file_util.h" + +#include <string> +#include <algorithm> + +#include "rocksdb/env.h" +#include "util/sst_file_manager_impl.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +// Utility function to copy a file up to a specified length +Status CopyFile(Env* env, const std::string& source, + const std::string& destination, uint64_t size, bool use_fsync) { + const EnvOptions soptions; + Status s; + std::unique_ptr<SequentialFileReader> src_reader; + std::unique_ptr<WritableFileWriter> dest_writer; + + { + std::unique_ptr<SequentialFile> srcfile; + s = env->NewSequentialFile(source, &srcfile, soptions); + if (!s.ok()) { + return s; + } + std::unique_ptr<WritableFile> destfile; + s = env->NewWritableFile(destination, &destfile, soptions); + if (!s.ok()) { + return s; + } + + if (size == 0) { + // default argument means copy everything + s = env->GetFileSize(source, &size); + if (!s.ok()) { + return s; + } + } + src_reader.reset(new SequentialFileReader(std::move(srcfile), source)); + dest_writer.reset( + new WritableFileWriter(std::move(destfile), destination, soptions)); + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size)); + s = src_reader->Read(bytes_to_read, &slice, buffer); + if (!s.ok()) { + return s; + } + if (slice.size() == 0) { + return Status::Corruption("file too small"); + } + s = dest_writer->Append(slice); + if (!s.ok()) { + return s; + } + size -= slice.size(); + } + return dest_writer->Sync(use_fsync); +} + +// Utility function to create a file with the provided contents +Status CreateFile(Env* env, const std::string& destination, + const std::string& contents, bool use_fsync) { + const EnvOptions soptions; + Status s; + std::unique_ptr<WritableFileWriter> dest_writer; + + std::unique_ptr<WritableFile> destfile; + s = env->NewWritableFile(destination, &destfile, soptions); + if (!s.ok()) { + return s; + } + dest_writer.reset( + new WritableFileWriter(std::move(destfile), destination, soptions)); + s = dest_writer->Append(Slice(contents)); + if (!s.ok()) { + return s; + } + return dest_writer->Sync(use_fsync); +} + +Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg) { +#ifndef ROCKSDB_LITE + SstFileManagerImpl* sfm = + static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get()); + if (sfm) { + return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); + } else { + return db_options->env->DeleteFile(fname); + } +#else + (void)dir_to_sync; + (void)force_bg; + // SstFileManager is not supported in ROCKSDB_LITE + // Delete file immediately + return db_options->env->DeleteFile(fname); +#endif +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/file_util.h b/src/rocksdb/util/file_util.h new file mode 100644 index 00000000..c3b365c8 --- /dev/null +++ b/src/rocksdb/util/file_util.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+#pragma once
+#include <string>
+
+#include "options/db_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "util/filename.h"
+
+namespace rocksdb {
+// use_fsync maps to options.use_fsync, which determines the way that
+// the file is synced after copying.
+extern Status CopyFile(Env* env, const std::string& source,
+                       const std::string& destination, uint64_t size,
+                       bool use_fsync);
+
+extern Status CreateFile(Env* env, const std::string& destination,
+                         const std::string& contents, bool use_fsync);
+
+extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
+                           const std::string& fname,
+                           const std::string& path_to_sync,
+                           const bool force_bg = false);
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/filelock_test.cc b/src/rocksdb/util/filelock_test.cc
new file mode 100644
index 00000000..f8721b59
--- /dev/null
+++ b/src/rocksdb/util/filelock_test.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/status.h"
+#include "rocksdb/env.h"
+
+#include <vector>
+#include <fcntl.h>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class LockTest : public testing::Test {
+ public:
+  static LockTest* current_;
+  std::string file_;
+  rocksdb::Env* env_;
+
+  LockTest()
+      : file_(test::PerThreadDBPath("db_testlock_file")),
+        env_(rocksdb::Env::Default()) {
+    current_ = this;
+  }
+
+  ~LockTest() override {}
+
+  Status LockFile(FileLock** db_lock) {
+    return env_->LockFile(file_, db_lock);
+  }
+
+  Status UnlockFile(FileLock* db_lock) {
+    return env_->UnlockFile(db_lock);
+  }
+
+  bool AssertFileIsLocked(){
+    return CheckFileLock( /* lock_expected = */ true);
+  }
+
+  bool AssertFileIsNotLocked(){
+    return CheckFileLock( /* lock_expected = */ false);
+  }
+
+  bool CheckFileLock(bool lock_expected){
+    // We need to fork to check the fcntl lock as we need
+    // to open and close the file from a different process
+    // to avoid either releasing the lock on close, or not
+    // contending for it when requesting a lock.
+
+#ifdef OS_WIN
+
+    // WaitForSingleObject and GetExitCodeProcess can do what waitpid does.
+    // TODO - implement on Windows
+    return true;
+
+#else
+
+    pid_t pid = fork();
+    if ( 0 == pid ) {
+      // child process
+      int exit_val = EXIT_FAILURE;
+      int fd = open(file_.c_str(), O_RDWR | O_CREAT, 0644);
+      if (fd < 0) {
+        // could not open file, could not check if it was locked
+        fprintf( stderr, "Open on file %s failed.\n",file_.c_str());
+        exit(exit_val);
+      }
+
+      struct flock f;
+      memset(&f, 0, sizeof(f));
+      f.l_type = (F_WRLCK);
+      f.l_whence = SEEK_SET;
+      f.l_start = 0;
+      f.l_len = 0; // Lock/unlock entire file
+      int value = fcntl(fd, F_SETLK, &f);
+      if( value == -1 ){
+        if( lock_expected ){
+          exit_val = EXIT_SUCCESS;
+        }
+      } else {
+        if( !
lock_expected ){ + exit_val = EXIT_SUCCESS; + } + } + close(fd); // lock is released for child process + exit(exit_val); + } else if (pid > 0) { + // parent process + int status; + while (-1 == waitpid(pid, &status, 0)); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + // child process exited with non success status + return false; + } else { + return true; + } + } else { + fprintf( stderr, "Fork failed\n" ); + return false; + } + return false; + +#endif + + } + +}; +LockTest* LockTest::current_; + +TEST_F(LockTest, LockBySameThread) { + FileLock* lock1; + FileLock* lock2; + + // acquire a lock on a file + ASSERT_OK(LockFile(&lock1)); + + // check the file is locked + ASSERT_TRUE( AssertFileIsLocked() ); + + // re-acquire the lock on the same file. This should fail. + ASSERT_TRUE(LockFile(&lock2).IsIOError()); + + // check the file is locked + ASSERT_TRUE( AssertFileIsLocked() ); + + // release the lock + ASSERT_OK(UnlockFile(lock1)); + + // check the file is not locked + ASSERT_TRUE( AssertFileIsNotLocked() ); + +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/filename.cc b/src/rocksdb/util/filename.cc new file mode 100644 index 00000000..32289aec --- /dev/null +++ b/src/rocksdb/util/filename.cc @@ -0,0 +1,410 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "util/filename.h" +#include <inttypes.h> + +#include <ctype.h> +#include <stdio.h> +#include <vector> +#include "rocksdb/env.h" +#include "util/file_reader_writer.h" +#include "util/logging.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/sync_point.h" + +namespace rocksdb { + +static const std::string kRocksDbTFileExt = "sst"; +static const std::string kLevelDbTFileExt = "ldb"; +static const std::string kRocksDBBlobFileExt = "blob"; + +// Given a path, flatten the path name by replacing all chars not in +// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end. +// Return the number of chars stored in dest not including the trailing '\0'. +static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { + const char suffix[] = "_LOG"; + + size_t write_idx = 0; + size_t i = 0; + size_t src_len = path.size(); + + while (i < src_len && write_idx < len - sizeof(suffix)) { + if ((path[i] >= 'a' && path[i] <= 'z') || + (path[i] >= '0' && path[i] <= '9') || + (path[i] >= 'A' && path[i] <= 'Z') || + path[i] == '-' || + path[i] == '.' 
|| + path[i] == '_'){ + dest[write_idx++] = path[i]; + } else { + if (i > 0) { + dest[write_idx++] = '_'; + } + } + i++; + } + assert(sizeof(suffix) <= len - write_idx); + // "\0" is automatically added by snprintf + snprintf(dest + write_idx, len - write_idx, suffix); + write_idx += sizeof(suffix) - 1; + return write_idx; +} + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast<unsigned long long>(number), + suffix); + return name + buf; +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string BlobFileName(const std::string& blobdirname, uint64_t number) { + assert(number > 0); + return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); +} + +std::string BlobFileName(const std::string& dbname, const std::string& blob_dir, + uint64_t number) { + assert(number > 0); + return MakeFileName(dbname + "/" + blob_dir, number, + kRocksDBBlobFileExt.c_str()); +} + +std::string ArchivalDirectory(const std::string& dir) { + return dir + "/" + ARCHIVAL_DIR; +} +std::string ArchivedLogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log"); +} + +std::string MakeTableFileName(const std::string& path, uint64_t number) { + return MakeFileName(path, number, kRocksDbTFileExt.c_str()); +} + +std::string Rocks2LevelTableFileName(const std::string& fullname) { + assert(fullname.size() > kRocksDbTFileExt.size() + 1); + if (fullname.size() <= kRocksDbTFileExt.size() + 1) { + return ""; + } + return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) + + kLevelDbTFileExt; +} + +uint64_t TableFileNameToNumber(const std::string& name) { + uint64_t number = 0; + uint64_t base = 1; + int pos = static_cast<int>(name.find_last_of('.')); + while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') { + number += (name[pos] - '0') * base; + base *= 10; + } + return number; +} + +std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number, + uint32_t path_id) { + assert(number > 0); + std::string path; + if (path_id >= db_paths.size()) { + path = db_paths.back().path; + } else { + path = db_paths[path_id].path; + } + return MakeTableFileName(path, number); +} + +void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, + size_t out_buf_size) { + if (path_id == 0) { + snprintf(out_buf, out_buf_size, "%" PRIu64, number); + } else { + snprintf(out_buf, out_buf_size, "%" PRIu64 + "(path " + "%" PRIu32 ")", + number, path_id); + } +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + static_cast<unsigned long long>(number)); + return dbname + buf; +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/CURRENT"; +} + +std::string LockFileName(const std::string& dbname) { + return dbname + "/LOCK"; +} + +std::string TempFileName(const std::string& dbname, uint64_t number) { + return MakeFileName(dbname, number, kTempFileNameSuffix.c_str()); +} + +InfoLogPrefix::InfoLogPrefix(bool has_log_dir, + const std::string& db_absolute_path) { + if (!has_log_dir) { + const char kInfoLogPrefix[] = "LOG"; + // "\0" is automatically added to the end + snprintf(buf, sizeof(buf), kInfoLogPrefix); + prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1); 
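+    // (Editorial note: sizeof(kInfoLogPrefix) counts the trailing '\0', so
+    // the "- 1" makes the prefix exactly the three characters "LOG".)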
+ } else { + size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf)); + prefix = Slice(buf, len); + } +} + +std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path, const std::string& log_dir) { + if (log_dir.empty()) { + return dbname + "/LOG"; + } + + InfoLogPrefix info_log_prefix(true, db_path); + return log_dir + "/" + info_log_prefix.buf; +} + +// Return the name of the old info log file for "dbname". +std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path, const std::string& log_dir) { + char buf[50]; + snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts)); + + if (log_dir.empty()) { + return dbname + "/LOG.old." + buf; + } + + InfoLogPrefix info_log_prefix(true, db_path); + return log_dir + "/" + info_log_prefix.buf + ".old." + buf; +} + +std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64, + kOptionsFileNamePrefix.c_str(), file_num); + return dbname + "/" + buffer; +} + +std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s", + kOptionsFileNamePrefix.c_str(), file_num, + kTempFileNameSuffix.c_str()); + return dbname + "/" + buffer; +} + +std::string MetaDatabaseName(const std::string& dbname, uint64_t number) { + char buf[100]; + snprintf(buf, sizeof(buf), "/METADB-%llu", + static_cast<unsigned long long>(number)); + return dbname + buf; +} + +std::string IdentityFileName(const std::string& dbname) { + return dbname + "/IDENTITY"; +} + +// Owned filenames have the form: +// dbname/IDENTITY +// dbname/CURRENT +// dbname/LOCK +// dbname/<info_log_name_prefix> +// dbname/<info_log_name_prefix>.old.[0-9]+ +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9]+.(log|sst|blob) +// dbname/METADB-[0-9]+ +// dbname/OPTIONS-[0-9]+ +// dbname/OPTIONS-[0-9]+.dbtmp +// Disregards / at the beginning +bool ParseFileName(const std::string& fname, + uint64_t* number, + FileType* type, + WalFileType* log_type) { + return ParseFileName(fname, number, "", type, log_type); +} + +bool ParseFileName(const std::string& fname, uint64_t* number, + const Slice& info_log_name_prefix, FileType* type, + WalFileType* log_type) { + Slice rest(fname); + if (fname.length() > 1 && fname[0] == '/') { + rest.remove_prefix(1); + } + if (rest == "IDENTITY") { + *number = 0; + *type = kIdentityFile; + } else if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (info_log_name_prefix.size() > 0 && + rest.starts_with(info_log_name_prefix)) { + rest.remove_prefix(info_log_name_prefix.size()); + if (rest == "" || rest == ".old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.starts_with(".old.")) { + uint64_t ts_suffix; + // sizeof also counts the trailing '\0'. 
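+      // (For example, sizeof(".old.") == 6, so the five characters ".old."
+      // are stripped here.)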
+ rest.remove_prefix(sizeof(".old.") - 1); + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = kInfoLogFile; + } + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else if (rest.starts_with("METADB-")) { + rest.remove_prefix(strlen("METADB-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kMetaDatabase; + *number = num; + } else if (rest.starts_with(kOptionsFileNamePrefix)) { + uint64_t ts_suffix; + bool is_temp_file = false; + rest.remove_prefix(kOptionsFileNamePrefix.size()); + const std::string kTempFileNameSuffixWithDot = + std::string(".") + kTempFileNameSuffix; + if (rest.ends_with(kTempFileNameSuffixWithDot)) { + rest.remove_suffix(kTempFileNameSuffixWithDot.size()); + is_temp_file = true; + } + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = is_temp_file ? kTempFile : kOptionsFile; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + bool archive_dir_found = false; + if (rest.starts_with(ARCHIVAL_DIR)) { + if (rest.size() <= ARCHIVAL_DIR.size()) { + return false; + } + rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also + if (log_type) { + *log_type = kArchivedLogFile; + } + archive_dir_found = true; + } + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (rest.size() <= 1 || rest[0] != '.') { + return false; + } + rest.remove_prefix(1); + + Slice suffix = rest; + if (suffix == Slice("log")) { + *type = kLogFile; + if (log_type && !archive_dir_found) { + *log_type = kAliveLogFile; + } + } else if (archive_dir_found) { + return false; // Archive dir can contain only log files + } else if (suffix == Slice(kRocksDbTFileExt) || + suffix == Slice(kLevelDbTFileExt)) { + *type = kTableFile; + } else if (suffix == Slice(kRocksDBBlobFileExt)) { + *type = kBlobFile; + } else if (suffix == Slice(kTempFileNameSuffix)) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number, + Directory* directory_to_fsync) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true); + if (s.ok()) { + TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + s = env->RenameFile(tmp, CurrentFileName(dbname)); + TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2); + } + if (s.ok()) { + if (directory_to_fsync != nullptr) { + s = directory_to_fsync->Fsync(); + } + } else { + env->DeleteFile(tmp); + } + return s; +} + +Status SetIdentityFile(Env* env, const std::string& dbname) { + std::string id = env->GenerateUniqueId(); + assert(!id.empty()); + // Reserve the filename dbname/000000.dbtmp for the temporary identity file + std::string tmp = TempFileName(dbname, 0); + Status s = WriteStringToFile(env, id, tmp, true); + if (s.ok()) { + s 
= env->RenameFile(tmp, IdentityFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, + WritableFileWriter* file) { + TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2); + StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + return file->Sync(db_options->use_fsync); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/filename.h b/src/rocksdb/util/filename.h new file mode 100644 index 00000000..eea6b1b0 --- /dev/null +++ b/src/rocksdb/util/filename.h @@ -0,0 +1,172 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#pragma once +#include <stdint.h> +#include <unordered_map> +#include <string> +#include <vector> + +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/transaction_log.h" + +namespace rocksdb { + +class Env; +class Directory; +class WritableFileWriter; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +extern std::string BlobFileName(const std::string& bdirname, uint64_t number); + +extern std::string BlobFileName(const std::string& dbname, + const std::string& blob_dir, uint64_t number); + +static const std::string ARCHIVAL_DIR = "archive"; + +extern std::string ArchivalDirectory(const std::string& dbname); + +// Return the name of the archived log file with the specified number +// in the db named by "dbname". The result will be prefixed with "dbname". +extern std::string ArchivedLogFileName(const std::string& dbname, + uint64_t num); + +extern std::string MakeTableFileName(const std::string& name, uint64_t number); + +// Return the name of sstable with LevelDB suffix +// created from RocksDB sstable suffixed name +extern std::string Rocks2LevelTableFileName(const std::string& fullname); + +// the reverse function of MakeTableFileName +// TODO(yhchiang): could merge this function with ParseFileName() +extern uint64_t TableFileNameToNumber(const std::string& name); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::vector<DbPath>& db_paths, + uint64_t number, uint32_t path_id); + +// Sufficient buffer size for FormatFileNumber. +const size_t kFormatFileNumberBufSize = 38; + +extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, + size_t out_buf_size); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. 
The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file. This file contains the name
+// of the current manifest file. The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname". The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// A helper structure for the prefix of info log names.
+struct InfoLogPrefix {
+  char buf[260];
+  Slice prefix;
+  // Prefix with DB absolute path encoded
+  explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+  // Default Prefix
+  explicit InfoLogPrefix();
+};
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+                                   const std::string& db_path = "",
+                                   const std::string& log_dir = "");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+                                      const std::string& db_path = "",
+                                      const std::string& log_dir = "");
+
+static const std::string kOptionsFileNamePrefix = "OPTIONS-";
+static const std::string kTempFileNameSuffix = "dbtmp";
+
+// Return an options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number]
+extern std::string OptionsFileName(const std::string& dbname,
+                                   uint64_t file_num);
+
+// Return a temp options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number].dbtmp
+extern std::string TempOptionsFileName(const std::string& dbname,
+                                       uint64_t file_num);
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+                                    uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh,
+// either from a backup image or empty.
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number. If the
+// filename was successfully parsed, returns true. Else returns false.
+// info_log_name_prefix is the path of info logs.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          const Slice& info_log_name_prefix, FileType* type,
+                          WalFileType* log_type = nullptr);
+// Same as previous function, but skips info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          FileType* type, WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number,
+                             Directory* directory_to_fsync);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname);
+
+// Sync manifest file `file`.
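+// (Editorial note: per the definition in filename.cc above, this wraps
+// file->Sync(db_options->use_fsync) and times it under
+// MANIFEST_FILE_SYNC_MICROS.)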
+extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, + WritableFileWriter* file); + +} // namespace rocksdb diff --git a/src/rocksdb/util/filter_policy.cc b/src/rocksdb/util/filter_policy.cc new file mode 100644 index 00000000..efb9bf47 --- /dev/null +++ b/src/rocksdb/util/filter_policy.cc @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +FilterPolicy::~FilterPolicy() { } + +} // namespace rocksdb diff --git a/src/rocksdb/util/gflags_compat.h b/src/rocksdb/util/gflags_compat.h new file mode 100644 index 00000000..0ea3aef5 --- /dev/null +++ b/src/rocksdb/util/gflags_compat.h @@ -0,0 +1,12 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <gflags/gflags.h> + +#ifndef GFLAGS_NAMESPACE +// in case it's not defined in old versions, that's probably because it was +// still google by default. +#define GFLAGS_NAMESPACE google +#endif diff --git a/src/rocksdb/util/hash.cc b/src/rocksdb/util/hash.cc new file mode 100644 index 00000000..852710d7 --- /dev/null +++ b/src/rocksdb/util/hash.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <string.h> +#include "util/coding.h" +#include "util/hash.h" +#include "util/util.h" + +namespace rocksdb { + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // Similar to murmur hash + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = static_cast<uint32_t>(seed ^ (n * m)); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + // Note: The original hash implementation used data[i] << shift, which + // promotes the char to int and then performs the shift. If the char is + // negative, the shift is undefined behavior in C++. The hash algorithm is + // part of the format definition, so we cannot change it; to obtain the same + // behavior in a legal way we just cast to uint32_t, which will do + // sign-extension. To guarantee compatibility with architectures where chars + // are unsigned we first cast the char to int8_t. 
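+  // (Editorial example: static_cast<uint32_t>(static_cast<int8_t>('\x9a'))
+  // yields 0xFFFFFF9A, the sign-extended value, regardless of whether plain
+  // char is signed on the target platform.)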
+    case 3:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
+      FALLTHROUGH_INTENDED;
+    case 2:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
+      FALLTHROUGH_INTENDED;
+    case 1:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/hash.h b/src/rocksdb/util/hash.h
new file mode 100644
index 00000000..ed42b089
--- /dev/null
+++ b/src/rocksdb/util/hash.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Simple hash function used for internal data structures
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+#include "util/murmurhash.h"
+
+namespace rocksdb {
+
+// Non-persistent hash. Only used for in-memory data structures.
+// The hash results are subject to change.
+extern uint64_t NPHash64(const char* data, size_t n, uint32_t seed);
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+inline uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+inline uint64_t GetSliceNPHash64(const Slice& s) {
+  return NPHash64(s.data(), s.size(), 0);
+}
+
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397);
+}
+
+inline uint64_t NPHash64(const char* data, size_t n, uint32_t seed) {
+  // Right now murmurhash2B is used. It should be possible to change it
+  // freely to a better hash, without worrying about backward
+  // compatibility issues.
+  return MURMUR_HASH(data, static_cast<int>(n),
+                     static_cast<unsigned int>(seed));
+}
+
+// std::hash compatible interface.
+struct SliceHasher {
+  uint32_t operator()(const Slice& s) const { return GetSliceHash(s); }
+};
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/hash_map.h b/src/rocksdb/util/hash_map.h
new file mode 100644
index 00000000..7b08fb39
--- /dev/null
+++ b/src/rocksdb/util/hash_map.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <utility>
+
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+// This is similar to std::unordered_map, except that it tries to avoid
+// allocating or deallocating memory as much as possible. With
+// std::unordered_map, an allocation/deallocation is made for every insertion
+// or deletion because of the requirement that iterators remain valid even
+// with insertions or deletions. This means that the hash chains will be
+// implemented as linked lists.
+//
+// This implementation uses autovector as hash chains instead.
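+//
+// Illustrative usage (editorial sketch, not part of the original patch):
+//
+//   HashMap<uint64_t, int> m;
+//   m.Insert(42, 1);
+//   if (m.Contains(42)) {
+//     int v = m.Get(42);  // Get() is undefined if the key is absent
+//     (void)v;
+//   }
+//   m.Delete(42);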
+// +template <typename K, typename V, size_t size = 128> +class HashMap { + std::array<autovector<std::pair<K, V>, 1>, size> table_; + + public: + bool Contains(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair<K, V>& p) { return p.first == key; }); + return it != bucket.end(); + } + + void Insert(K key, V value) { + auto& bucket = table_[key % size]; + bucket.push_back({key, value}); + } + + void Delete(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair<K, V>& p) { return p.first == key; }); + if (it != bucket.end()) { + auto last = bucket.end() - 1; + if (it != last) { + *it = *last; + } + bucket.pop_back(); + } + } + + V& Get(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair<K, V>& p) { return p.first == key; }); + return it->second; + } +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/hash_test.cc b/src/rocksdb/util/hash_test.cc new file mode 100644 index 00000000..959e8cd0 --- /dev/null +++ b/src/rocksdb/util/hash_test.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <vector> + +#include "util/hash.h" +#include "util/testharness.h" + +// The hash algorithm is part of the file format, for example for the Bloom +// filters. Test that the hash values are stable for a set of random strings of +// varying lengths. +TEST(HashTest, Values) { + using rocksdb::Hash; + constexpr uint32_t kSeed = 0xbc9f1d34; // Same as BloomHash. 
+ + EXPECT_EQ(Hash("", 0, kSeed), 3164544308); + EXPECT_EQ(Hash("\x08", 1, kSeed), 422599524); + EXPECT_EQ(Hash("\x17", 1, kSeed), 3168152998); + EXPECT_EQ(Hash("\x9a", 1, kSeed), 3195034349); + EXPECT_EQ(Hash("\x1c", 1, kSeed), 2651681383); + EXPECT_EQ(Hash("\x4d\x76", 2, kSeed), 2447836956); + EXPECT_EQ(Hash("\x52\xd5", 2, kSeed), 3854228105); + EXPECT_EQ(Hash("\x91\xf7", 2, kSeed), 31066776); + EXPECT_EQ(Hash("\xd6\x27", 2, kSeed), 1806091603); + EXPECT_EQ(Hash("\x30\x46\x0b", 3, kSeed), 3808221797); + EXPECT_EQ(Hash("\x56\xdc\xd6", 3, kSeed), 2157698265); + EXPECT_EQ(Hash("\xd4\x52\x33", 3, kSeed), 1721992661); + EXPECT_EQ(Hash("\x6a\xb5\xf4", 3, kSeed), 2469105222); + EXPECT_EQ(Hash("\x67\x53\x81\x1c", 4, kSeed), 118283265); + EXPECT_EQ(Hash("\x69\xb8\xc0\x88", 4, kSeed), 3416318611); + EXPECT_EQ(Hash("\x1e\x84\xaf\x2d", 4, kSeed), 3315003572); + EXPECT_EQ(Hash("\x46\xdc\x54\xbe", 4, kSeed), 447346355); + EXPECT_EQ(Hash("\xd0\x7a\x6e\xea\x56", 5, kSeed), 4255445370); + EXPECT_EQ(Hash("\x86\x83\xd5\xa4\xd8", 5, kSeed), 2390603402); + EXPECT_EQ(Hash("\xb7\x46\xbb\x77\xce", 5, kSeed), 2048907743); + EXPECT_EQ(Hash("\x6c\xa8\xbc\xe5\x99", 5, kSeed), 2177978500); + EXPECT_EQ(Hash("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), 1036846008); + EXPECT_EQ(Hash("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), 229980482); + EXPECT_EQ(Hash("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), 3655585422); + EXPECT_EQ(Hash("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), 3502708029); + EXPECT_EQ(Hash("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), 815120748); + EXPECT_EQ(Hash("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), 3056033698); + EXPECT_EQ(Hash("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), 587205227); + EXPECT_EQ(Hash("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), 2030937252); + EXPECT_EQ(Hash("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), 469635402); + EXPECT_EQ(Hash("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), 3530274698); + EXPECT_EQ(Hash("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), 1974545809); + EXPECT_EQ(Hash("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), 3563570120); + EXPECT_EQ(Hash("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed), 2706087434); + EXPECT_EQ(Hash("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed), 1534654151); + EXPECT_EQ(Hash("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed), 2355554696); + EXPECT_EQ(Hash("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed), 1400800912); + EXPECT_EQ(Hash("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed), + 3420325137); + EXPECT_EQ(Hash("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed), + 3427803584); + EXPECT_EQ(Hash("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed), + 1152407945); + EXPECT_EQ(Hash("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed), + 3382479516); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/heap.h b/src/rocksdb/util/heap.h new file mode 100644 index 00000000..6093c20e --- /dev/null +++ b/src/rocksdb/util/heap.h @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <algorithm> +#include <cstdint> +#include <functional> +#include "port/port.h" +#include "util/autovector.h" + +namespace rocksdb { + +// Binary heap implementation optimized for use in multi-way merge sort. 
+// Comparison to std::priority_queue: +// - In libstdc++, std::priority_queue::pop() usually performs just over logN +// comparisons but never fewer. +// - std::priority_queue does not have a replace-top operation, requiring a +// pop+push. If the replacement element is the new top, this requires +// around 2logN comparisons. +// - This heap's pop() uses a "schoolbook" downheap which requires up to ~2logN +// comparisons. +// - This heap provides a replace_top() operation which requires [1, 2logN] +// comparisons. When the replacement element is also the new top, this +// takes just 1 or 2 comparisons. +// +// The last property can yield an order-of-magnitude performance improvement +// when merge-sorting real-world non-random data. If the merge operation is +// likely to take chunks of elements from the same input stream, only 1 +// comparison per element is needed. In RocksDB-land, this happens when +// compacting a database where keys are not randomly distributed across L0 +// files but nearby keys are likely to be in the same L0 file. +// +// The container uses the same counterintuitive ordering as +// std::priority_queue: the comparison operator is expected to provide the +// less-than relation, but top() will return the maximum. + +template<typename T, typename Compare = std::less<T>> +class BinaryHeap { + public: + BinaryHeap() { } + explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) { } + + void push(const T& value) { + data_.push_back(value); + upheap(data_.size() - 1); + } + + void push(T&& value) { + data_.push_back(std::move(value)); + upheap(data_.size() - 1); + } + + const T& top() const { + assert(!empty()); + return data_.front(); + } + + void replace_top(const T& value) { + assert(!empty()); + data_.front() = value; + downheap(get_root()); + } + + void replace_top(T&& value) { + assert(!empty()); + data_.front() = std::move(value); + downheap(get_root()); + } + + void pop() { + assert(!empty()); + data_.front() = std::move(data_.back()); + data_.pop_back(); + if (!empty()) { + downheap(get_root()); + } else { + reset_root_cmp_cache(); + } + } + + void swap(BinaryHeap &other) { + std::swap(cmp_, other.cmp_); + data_.swap(other.data_); + std::swap(root_cmp_cache_, other.root_cmp_cache_); + } + + void clear() { + data_.clear(); + reset_root_cmp_cache(); + } + + bool empty() const { return data_.empty(); } + + size_t size() const { return data_.size(); } + + void reset_root_cmp_cache() { root_cmp_cache_ = port::kMaxSizet; } + + private: + static inline size_t get_root() { return 0; } + static inline size_t get_parent(size_t index) { return (index - 1) / 2; } + static inline size_t get_left(size_t index) { return 2 * index + 1; } + static inline size_t get_right(size_t index) { return 2 * index + 2; } + + void upheap(size_t index) { + T v = std::move(data_[index]); + while (index > get_root()) { + const size_t parent = get_parent(index); + if (!cmp_(data_[parent], v)) { + break; + } + data_[index] = std::move(data_[parent]); + index = parent; + } + data_[index] = std::move(v); + reset_root_cmp_cache(); + } + + void downheap(size_t index) { + T v = std::move(data_[index]); + + size_t picked_child = port::kMaxSizet; + while (1) { + const size_t left_child = get_left(index); + if (get_left(index) >= data_.size()) { + break; + } + const size_t right_child = left_child + 1; + assert(right_child == get_right(index)); + picked_child = left_child; + if (index == 0 && root_cmp_cache_ < data_.size()) { + picked_child = root_cmp_cache_; + } else if (right_child < data_.size() && + 
cmp_(data_[left_child], data_[right_child])) {
+        picked_child = right_child;
+      }
+      if (!cmp_(v, data_[picked_child])) {
+        break;
+      }
+      data_[index] = std::move(data_[picked_child]);
+      index = picked_child;
+    }
+
+    if (index == 0) {
+      // We did not change anything in the tree except for the value of the
+      // root node, and its left and right children did not change, so we can
+      // cache that `picked_child` is the larger (per cmp_) of the two
+      // children and compare against it directly next time.
+      root_cmp_cache_ = picked_child;
+    } else {
+      // the tree changed, reset cache
+      reset_root_cmp_cache();
+    }
+
+    data_[index] = std::move(v);
+  }
+
+  Compare cmp_;
+  autovector<T> data_;
+  // Used to reduce number of cmp_ calls in downheap()
+  size_t root_cmp_cache_ = port::kMaxSizet;
+};
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/heap_test.cc b/src/rocksdb/util/heap_test.cc
new file mode 100644
index 00000000..d036a62e
--- /dev/null
+++ b/src/rocksdb/util/heap_test.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <gtest/gtest.h>
+
+#include <climits>
+
+#include <queue>
+#include <random>
+#include <utility>
+
+#include "util/heap.h"
+
+#ifndef GFLAGS
+const int64_t FLAGS_iters = 100000;
+#else
+#include "util/gflags_compat.h"
+DEFINE_int64(iters, 100000, "number of pseudo-random operations in each test");
+#endif  // GFLAGS
+
+/*
+ * Compares the custom heap implementation in util/heap.h against
+ * std::priority_queue on a pseudo-random sequence of operations.
+ */
+
+namespace rocksdb {
+
+using HeapTestValue = uint64_t;
+using Params = std::tuple<size_t, HeapTestValue, int64_t>;
+
+class HeapTest : public ::testing::TestWithParam<Params> {
+};
+
+TEST_P(HeapTest, Test) {
+  // This test performs the same pseudorandom sequence of operations on a
+  // BinaryHeap and an std::priority_queue, comparing output. The three
+  // possible operations are insert, replace top and pop.
+  //
+  // Insert is chosen slightly more often than the others so that the size of
+  // the heap slowly grows. Once the size hits the MAX_HEAP_SIZE limit, we
+  // disallow inserting until the heap becomes empty, testing the "draining"
+  // scenario.
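+  //
+  // (Editorial note: each replace_top(val) on the BinaryHeap is mirrored on
+  // the std::priority_queue reference as pop() followed by push(val), since
+  // std::priority_queue has no replace-top operation.)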
+ + const auto MAX_HEAP_SIZE = std::get<0>(GetParam()); + const auto MAX_VALUE = std::get<1>(GetParam()); + const auto RNG_SEED = std::get<2>(GetParam()); + + BinaryHeap<HeapTestValue> heap; + std::priority_queue<HeapTestValue> ref; + + std::mt19937 rng(static_cast<unsigned int>(RNG_SEED)); + std::uniform_int_distribution<HeapTestValue> value_dist(0, MAX_VALUE); + int ndrains = 0; + bool draining = false; // hit max size, draining until we empty the heap + size_t size = 0; + for (int64_t i = 0; i < FLAGS_iters; ++i) { + if (size == 0) { + draining = false; + } + + if (!draining && + (size == 0 || std::bernoulli_distribution(0.4)(rng))) { + // insert + HeapTestValue val = value_dist(rng); + heap.push(val); + ref.push(val); + ++size; + if (size == MAX_HEAP_SIZE) { + draining = true; + ++ndrains; + } + } else if (std::bernoulli_distribution(0.5)(rng)) { + // replace top + HeapTestValue val = value_dist(rng); + heap.replace_top(val); + ref.pop(); + ref.push(val); + } else { + // pop + assert(size > 0); + heap.pop(); + ref.pop(); + --size; + } + + // After every operation, check that the public methods give the same + // results + assert((size == 0) == ref.empty()); + ASSERT_EQ(size == 0, heap.empty()); + if (size > 0) { + ASSERT_EQ(ref.top(), heap.top()); + } + } + + // Probabilities should be set up to occasionally hit the max heap size and + // drain it + assert(ndrains > 0); + + heap.clear(); + ASSERT_TRUE(heap.empty()); +} + +// Basic test, MAX_VALUE = 3*MAX_HEAP_SIZE (occasional duplicates) +INSTANTIATE_TEST_CASE_P( + Basic, HeapTest, + ::testing::Values(Params(1000, 3000, 0x1b575cf05b708945)) +); +// Mid-size heap with small values (many duplicates) +INSTANTIATE_TEST_CASE_P( + SmallValues, HeapTest, + ::testing::Values(Params(100, 10, 0x5ae213f7bd5dccd0)) +); +// Small heap, large value range (no duplicates) +INSTANTIATE_TEST_CASE_P( + SmallHeap, HeapTest, + ::testing::Values(Params(10, ULLONG_MAX, 0x3e1fa8f4d01707cf)) +); +// Two-element heap +INSTANTIATE_TEST_CASE_P( + TwoElementHeap, HeapTest, + ::testing::Values(Params(2, 5, 0x4b5e13ea988c6abc)) +); +// One-element heap +INSTANTIATE_TEST_CASE_P( + OneElementHeap, HeapTest, + ::testing::Values(Params(1, 3, 0x176a1019ab0b612e)) +); + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/jemalloc_nodump_allocator.cc b/src/rocksdb/util/jemalloc_nodump_allocator.cc new file mode 100644 index 00000000..cdd08e93 --- /dev/null +++ b/src/rocksdb/util/jemalloc_nodump_allocator.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
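To make the replace_top() payoff described in util/heap.h concrete, here is a minimal k-way merge sketch built on the BinaryHeap above. It is illustrative only and not part of the diff: the Entry type, the MergeSorted helper, and plain int vectors as input streams are all invented for the example; std::greater turns the max-heap into a min-heap so top() yields the smallest pending element.

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

#include "util/heap.h"

// (current value, index of the source stream it came from)
using Entry = std::pair<int, size_t>;

std::vector<int> MergeSorted(const std::vector<std::vector<int>>& streams) {
  // BinaryHeap is a max-heap with respect to its comparator, so supplying
  // std::greater makes top() the minimum -- what merging ascending streams
  // needs.
  rocksdb::BinaryHeap<Entry, std::greater<Entry>> heap;
  std::vector<size_t> pos(streams.size(), 0);
  for (size_t i = 0; i < streams.size(); ++i) {
    if (!streams[i].empty()) {
      heap.push(Entry(streams[i][0], i));
    }
  }
  std::vector<int> out;
  while (!heap.empty()) {
    const Entry e = heap.top();
    out.push_back(e.first);
    const size_t s = e.second;
    if (++pos[s] < streams[s].size()) {
      // replace_top() instead of pop()+push(): when the same stream keeps
      // winning (clustered keys), root_cmp_cache_ cuts this to 1-2
      // comparisons per element.
      heap.replace_top(Entry(streams[s][pos[s]], s));
    } else {
      heap.pop();
    }
  }
  return out;
}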
+
+#include "util/jemalloc_nodump_allocator.h"
+
+#include <string>
+#include <thread>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr};
+
+JemallocNodumpAllocator::JemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::unique_ptr<extent_hooks_t>&& arena_hooks, unsigned arena_index)
+    : options_(options),
+      arena_hooks_(std::move(arena_hooks)),
+      arena_index_(arena_index),
+      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {}
+
+int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) {
+  // We always enable tcache. The only corner case is when many threads
+  // access the allocator infrequently: tcache memory can then grow large
+  // (it may reach # threads * ~1MB) without bringing much benefit.
+  if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound ||
+                                     size > options_.tcache_size_upper_bound)) {
+    return MALLOCX_TCACHE_NONE;
+  }
+  unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get());
+  if (UNLIKELY(tcache_index == nullptr)) {
+    // Instantiate tcache.
+    tcache_index = new unsigned(0);
+    size_t tcache_index_size = sizeof(unsigned);
+    int ret =
+        mallctl("tcache.create", tcache_index, &tcache_index_size, nullptr, 0);
+    if (ret != 0) {
+      // No good way to expose the error. Silently disable tcache.
+      delete tcache_index;
+      return MALLOCX_TCACHE_NONE;
+    }
+    tcache_.Reset(static_cast<void*>(tcache_index));
+  }
+  return MALLOCX_TCACHE(*tcache_index);
+}
+
+void* JemallocNodumpAllocator::Allocate(size_t size) {
+  int tcache_flag = GetThreadSpecificCache(size);
+  return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag);
+}
+
+void JemallocNodumpAllocator::Deallocate(void* p) {
+  // Obtain tcache.
+  size_t size = 0;
+  if (options_.limit_tcache_size) {
+    size = malloc_usable_size(p);
+  }
+  int tcache_flag = GetThreadSpecificCache(size);
+  // No need to pass arena index to dallocx(). Jemalloc will find arena index
+  // from its own metadata.
+  dallocx(p, tcache_flag);
+}
+
+void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
+                                     size_t size, size_t alignment, bool* zero,
+                                     bool* commit, unsigned arena_ind) {
+  extent_alloc_t* original_alloc =
+      original_alloc_.load(std::memory_order_relaxed);
+  assert(original_alloc != nullptr);
+  void* result = original_alloc(extent, new_addr, size, alignment, zero,
+                                commit, arena_ind);
+  if (result != nullptr) {
+    int ret = madvise(result, size, MADV_DONTDUMP);
+    if (ret != 0) {
+      fprintf(stderr,
+              "JemallocNodumpAllocator failed to set MADV_DONTDUMP, "
+              "error code: %d\n",
+              ret);
+      assert(false);
+    }
+  }
+  return result;
+}
+
+Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) {
+  assert(arena_index != 0);
+  std::string key = "arena." + ToString(arena_index) + ".destroy";
+  int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to destroy jemalloc arena, error code: " +
+                              ToString(ret));
+  }
+  return Status::OK();
+}
+
+void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) {
+  assert(ptr != nullptr);
+  unsigned* tcache_index = static_cast<unsigned*>(ptr);
+  size_t tcache_index_size = sizeof(unsigned);
+  int ret __attribute__((__unused__)) =
+      mallctl("tcache.destroy", nullptr, 0, tcache_index, tcache_index_size);
+  // Silently ignore error.
+  assert(ret == 0);
+  delete tcache_index;
+}
+
+JemallocNodumpAllocator::~JemallocNodumpAllocator() {
+  // Destroy tcache before destroying arena.
+  autovector<void*> tcache_list;
+  tcache_.Scrape(&tcache_list, nullptr);
+  for (void* tcache_index : tcache_list) {
+    DestroyThreadSpecificCache(tcache_index);
+  }
+  // Destroy arena. Silently ignore error.
+  Status s __attribute__((__unused__)) = DestroyArena(arena_index_);
+  assert(s.ok());
+}
+
+size_t JemallocNodumpAllocator::UsableSize(void* p,
+                                           size_t /*allocation_size*/) const {
+  return malloc_usable_size(static_cast<void*>(p));
+}
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator) {
+  if (memory_allocator == nullptr) {
+    return Status::InvalidArgument("memory_allocator must be non-null.");
+  }
+  *memory_allocator = nullptr;
+  Status unsupported = Status::NotSupported(
+      "JemallocNodumpAllocator is only available with jemalloc version >= 5 "
+      "and when MADV_DONTDUMP is available.");
+#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  (void)options;
+  return unsupported;
+#else
+  if (!HasJemalloc()) {
+    return unsupported;
+  }
+  if (options.limit_tcache_size &&
+      options.tcache_size_lower_bound >= options.tcache_size_upper_bound) {
+    return Status::InvalidArgument(
+        "tcache_size_lower_bound must be smaller than "
+        "tcache_size_upper_bound.");
+  }
+
+  // Create arena.
+  unsigned arena_index = 0;
+  size_t arena_index_size = sizeof(arena_index);
+  int ret =
+      mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to create jemalloc arena, error code: " +
+                              ToString(ret));
+  }
+  assert(arena_index != 0);
+
+  // Read existing hooks.
+  std::string key = "arena." + ToString(arena_index) + ".extent_hooks";
+  extent_hooks_t* hooks;
+  size_t hooks_size = sizeof(hooks);
+  ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Failed to read existing hooks, error code: " +
+                              ToString(ret));
+  }
+
+  // Store existing alloc.
+  extent_alloc_t* original_alloc = hooks->alloc;
+  extent_alloc_t* expected = nullptr;
+  bool success =
+      JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+          expected, original_alloc);
+  if (!success && original_alloc != expected) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Original alloc conflict.");
+  }
+
+  // Set the custom hook.
+  std::unique_ptr<extent_hooks_t> new_hooks(new extent_hooks_t(*hooks));
+  new_hooks->alloc = &JemallocNodumpAllocator::Alloc;
+  extent_hooks_t* hooks_ptr = new_hooks.get();
+  ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Failed to set custom hook, error code: " +
+                              ToString(ret));
+  }
+
+  // Create cache allocator.
+  memory_allocator->reset(
+      new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index));
+  return Status::OK();
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/jemalloc_nodump_allocator.h b/src/rocksdb/util/jemalloc_nodump_allocator.h
new file mode 100644
index 00000000..e93c1223
--- /dev/null
+++ b/src/rocksdb/util/jemalloc_nodump_allocator.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <vector>
+
+#include "port/jemalloc_helper.h"
+#include "port/port.h"
+#include "rocksdb/memory_allocator.h"
+#include "util/core_local.h"
+#include "util/thread_local.h"
+
+#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
+
+#include <sys/mman.h>
+
+#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)
+#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+namespace rocksdb {
+
+class JemallocNodumpAllocator : public MemoryAllocator {
+ public:
+  JemallocNodumpAllocator(JemallocAllocatorOptions& options,
+                          std::unique_ptr<extent_hooks_t>&& arena_hooks,
+                          unsigned arena_index);
+  ~JemallocNodumpAllocator();
+
+  const char* Name() const override { return "JemallocNodumpAllocator"; }
+  void* Allocate(size_t size) override;
+  void Deallocate(void* p) override;
+  size_t UsableSize(void* p, size_t allocation_size) const override;
+
+ private:
+  friend Status NewJemallocNodumpAllocator(
+      JemallocAllocatorOptions& options,
+      std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+  // Custom alloc hook to replace the jemalloc default alloc.
+  static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
+                     size_t alignment, bool* zero, bool* commit,
+                     unsigned arena_ind);
+
+  // Destroy arena on destruction of the allocator, or on failure.
+  static Status DestroyArena(unsigned arena_index);
+
+  // Destroy tcache on destruction of the allocator, or on thread exit.
+  static void DestroyThreadSpecificCache(void* ptr);
+
+  // Get or create tcache. Return a flag suitable for use with `mallocx`:
+  // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc).
+  int GetThreadSpecificCache(size_t size);
+
+  // A function pointer to the jemalloc default alloc. Use an atomic to make
+  // sure NewJemallocNodumpAllocator is thread-safe.
+  //
+  // Hack: original_alloc_ needs to be static for Alloc() to access it.
+  // alloc needs to be static to pass to jemalloc as a function pointer.
+  static std::atomic<extent_alloc_t*> original_alloc_;
+
+  const JemallocAllocatorOptions options_;
+
+  // Custom hooks have to outlive the corresponding arena.
+  const std::unique_ptr<extent_hooks_t> arena_hooks_;
+
+  // Arena index.
+  const unsigned arena_index_;
+
+  // Holds the thread-local tcache index.
+  ThreadLocalPtr tcache_;
+};
+
+}  // namespace rocksdb
+#endif  // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP
+#endif  // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
diff --git a/src/rocksdb/util/kv_map.h b/src/rocksdb/util/kv_map.h
new file mode 100644
index 00000000..d5ba3307
--- /dev/null
+++ b/src/rocksdb/util/kv_map.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
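A sketch of how the nodump allocator above is meant to be wired in, assuming the public rocksdb/cache.h API of this era (LRUCacheOptions gained a memory_allocator field alongside this allocator); the helper name and capacity value are invented for illustration. When jemalloc 5+ or MADV_DONTDUMP is unavailable, NewJemallocNodumpAllocator returns NotSupported and the cache simply falls back to the default allocator.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

std::shared_ptr<rocksdb::Cache> NewBlockCacheExcludedFromCoreDumps() {
  rocksdb::JemallocAllocatorOptions jopts;
  std::shared_ptr<rocksdb::MemoryAllocator> allocator;
  rocksdb::Status s = rocksdb::NewJemallocNodumpAllocator(jopts, &allocator);

  rocksdb::LRUCacheOptions cache_opts;
  cache_opts.capacity = static_cast<size_t>(1) << 30;  // 1 GiB, arbitrary
  if (s.ok()) {
    // Cache memory now comes from a dedicated jemalloc arena whose pages are
    // marked MADV_DONTDUMP, keeping block cache contents out of core dumps.
    cache_opts.memory_allocator = allocator;
  }
  return rocksdb::NewLRUCache(cache_opts);
}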
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+namespace stl_wrappers {
+
+struct LessOfComparator {
+  explicit LessOfComparator(const Comparator* c = BytewiseComparator())
+      : cmp(c) {}
+
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+  bool operator()(const Slice& a, const Slice& b) const {
+    return cmp->Compare(a, b) < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+typedef std::map<std::string, std::string, LessOfComparator> KVMap;
+}  // namespace stl_wrappers
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/log_buffer.cc b/src/rocksdb/util/log_buffer.cc
new file mode 100644
index 00000000..d09e0cb0
--- /dev/null
+++ b/src/rocksdb/util/log_buffer.cc
@@ -0,0 +1,93 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/log_buffer.h"
+
+#include "port/sys_time.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger* info_log)
+    : log_level_(log_level), info_log_(info_log) {}
+
+void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format,
+                               va_list ap) {
+  if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) {
+    // Skip the message if its level is below the logger's log level.
+    return;
+  }
+
+  char* alloc_mem = arena_.AllocateAligned(max_log_size);
+  BufferedLog* buffered_log = new (alloc_mem) BufferedLog();
+  char* p = buffered_log->message;
+  char* limit = alloc_mem + max_log_size - 1;
+
+  // Store the time
+  gettimeofday(&(buffered_log->now_tv), nullptr);
+
+  // Print the message
+  if (p < limit) {
+    va_list backup_ap;
+    va_copy(backup_ap, ap);
+    auto n = vsnprintf(p, limit - p, format, backup_ap);
+#ifndef OS_WIN
+    // MS reports -1 when the buffer is too short
+    assert(n >= 0);
+#endif
+    if (n > 0) {
+      p += n;
+    } else {
+      p = limit;
+    }
+    va_end(backup_ap);
+  }
+
+  if (p > limit) {
+    p = limit;
+  }
+
+  // Add '\0' to the end
+  *p = '\0';
+
+  logs_.push_back(buffered_log);
+}
+
+void LogBuffer::FlushBufferToLog() {
+  for (BufferedLog* log : logs_) {
+    const time_t seconds = log->now_tv.tv_sec;
+    struct tm t;
+    if (localtime_r(&seconds, &t) != nullptr) {
+      Log(log_level_, info_log_,
+          "(Original Log Time %04d/%02d/%02d-%02d:%02d:%02d.%06d) %s",
+          t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min,
+          t.tm_sec, static_cast<int>(log->now_tv.tv_usec), log->message);
+    }
+  }
+  logs_.clear();
+}
+
+void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format,
+                 ...) {
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(max_log_size, format, ap);
+    va_end(ap);
+  }
+}
+
+void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) {
+  const size_t kDefaultMaxLogSize = 512;
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(kDefaultMaxLogSize, format, ap);
+    va_end(ap);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/log_buffer.h b/src/rocksdb/util/log_buffer.h
new file mode 100644
index 00000000..e356b93a
--- /dev/null
+++ b/src/rocksdb/util/log_buffer.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "util/arena.h"
+#include "util/autovector.h"
+#include "port/sys_time.h"
+#include <ctime>
+
+namespace rocksdb {
+
+class Logger;
+
+// A class to buffer info log entries and flush them in the end.
+class LogBuffer {
+ public:
+  // log_level: the log level for all the logs
+  // info_log: logger to write the logs to
+  LogBuffer(const InfoLogLevel log_level, Logger* info_log);
+
+  // Add a log entry to the buffer.
+  // max_log_size indicates the maximum log size, including some metadata.
+  void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap);
+
+  bool IsEmpty() const { return logs_.empty(); }
+
+  // Flush all buffered logs to the info log.
+  void FlushBufferToLog();
+
+ private:
+  // One log entry with its timestamp
+  struct BufferedLog {
+    struct timeval now_tv;  // Timestamp of the log
+    char message[1];        // Beginning of log message
+  };
+
+  const InfoLogLevel log_level_;
+  Logger* info_log_;
+  Arena arena_;
+  autovector<BufferedLog*> logs_;
+};
+
+// Add a log to the LogBuffer for delayed info logging. It can be used when
+// we want to add some logs while holding a mutex.
+// max_log_size indicates the maximum log size, including some metadata.
+extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size,
+                        const char* format, ...);
+// Same as the previous function, but with the default max log size.
+extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/log_write_bench.cc b/src/rocksdb/util/log_write_bench.cc
new file mode 100644
index 00000000..5c9b3e84
--- /dev/null
+++ b/src/rocksdb/util/log_write_bench.cc
@@ -0,0 +1,83 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
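The intended flow for LogBuffer above: format messages while holding a lock, then do the actual logger I/O after releasing it. A minimal sketch, assuming a hypothetical caller that already owns a port::Mutex and a Logger (the function name and message are invented):

#include "util/log_buffer.h"
#include "util/mutexlock.h"

void UpdateStateAndLog(rocksdb::port::Mutex* mu, rocksdb::Logger* info_log) {
  rocksdb::LogBuffer log_buffer(rocksdb::InfoLogLevel::INFO_LEVEL, info_log);
  {
    rocksdb::MutexLock l(mu);
    // ... mutate shared state under the mutex ...
    // Formatting goes into the arena-backed buffer; no I/O happens here.
    rocksdb::LogToBuffer(&log_buffer, "picked %d files for compaction", 4);
  }
  // The buffered entries are written out now, outside the critical section,
  // prefixed with the timestamps captured at LogToBuffer() time.
  log_buffer.FlushBufferToLog();
}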
+ +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include "monitoring/histogram.h" +#include "rocksdb/env.h" +#include "util/file_reader_writer.h" +#include "util/gflags_compat.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; + +// A simple benchmark to simulate transactional logs + +DEFINE_int32(num_records, 6000, "Number of records."); +DEFINE_int32(record_size, 249, "Size of each record."); +DEFINE_int32(record_interval, 10000, "Interval between records (microSec)"); +DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions"); +DEFINE_bool(enable_sync, false, "sync after each write."); + +namespace rocksdb { +void RunBenchmark() { + std::string file_name = test::PerThreadDBPath("log_write_benchmark.log"); + Env* env = Env::Default(); + EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions()); + env_options.bytes_per_sync = FLAGS_bytes_per_sync; + std::unique_ptr<WritableFile> file; + env->NewWritableFile(file_name, &file, env_options); + std::unique_ptr<WritableFileWriter> writer; + writer.reset(new WritableFileWriter(std::move(file), env_options)); + + std::string record; + record.assign(FLAGS_record_size, 'X'); + + HistogramImpl hist; + + uint64_t start_time = env->NowMicros(); + for (int i = 0; i < FLAGS_num_records; i++) { + uint64_t start_nanos = env->NowNanos(); + writer->Append(record); + writer->Flush(); + if (FLAGS_enable_sync) { + writer->Sync(false); + } + hist.Add(env->NowNanos() - start_nanos); + + if (i % 1000 == 1) { + fprintf(stderr, "Wrote %d records...\n", i); + } + + int time_to_sleep = + (i + 1) * FLAGS_record_interval - (env->NowMicros() - start_time); + if (time_to_sleep > 0) { + env->SleepForMicroseconds(time_to_sleep); + } + } + + fprintf(stderr, "Distribution of latency of append+flush: \n%s", + hist.ToString().c_str()); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + rocksdb::RunBenchmark(); + return 0; +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/logging.h b/src/rocksdb/util/logging.h new file mode 100644 index 00000000..a4ef31bd --- /dev/null +++ b/src/rocksdb/util/logging.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#pragma once + +// Helper macros that include information about file name and line number +#define ROCKS_LOG_STRINGIFY(x) #x +#define ROCKS_LOG_TOSTRING(x) ROCKS_LOG_STRINGIFY(x) +#define ROCKS_LOG_PREPEND_FILE_LINE(FMT) ("[%s:" ROCKS_LOG_TOSTRING(__LINE__) "] " FMT) + +inline const char* RocksLogShorterFileName(const char* file) +{ + // 15 is the length of "util/logging.h". + // If the name of this file changed, please change this number, too. + return file + (sizeof(__FILE__) > 15 ? 
sizeof(__FILE__) - 15 : 0);
+}
+
+// Don't include file/line info in HEADER level
+#define ROCKS_LOG_HEADER(LGR, FMT, ...) \
+  rocksdb::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_DEBUG(LGR, FMT, ...)                                        \
+  rocksdb::Log(InfoLogLevel::DEBUG_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_INFO(LGR, FMT, ...)                                         \
+  rocksdb::Log(InfoLogLevel::INFO_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_WARN(LGR, FMT, ...)                                         \
+  rocksdb::Log(InfoLogLevel::WARN_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_ERROR(LGR, FMT, ...)                                        \
+  rocksdb::Log(InfoLogLevel::ERROR_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_FATAL(LGR, FMT, ...)                                        \
+  rocksdb::Log(InfoLogLevel::FATAL_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...)                                   \
+  rocksdb::LogToBuffer(LOG_BUF, ROCKS_LOG_PREPEND_FILE_LINE(FMT),             \
+                       RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER_MAX_SZ(LOG_BUF, MAX_LOG_SIZE, FMT, ...)              \
+  rocksdb::LogToBuffer(LOG_BUF, MAX_LOG_SIZE, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+                       RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_DETAILS(LGR, FMT, ...) \
+  ;  // skipped by default due to the overhead of such lines
+// ROCKS_LOG_DEBUG(LGR, FMT, ##__VA_ARGS__)
diff --git a/src/rocksdb/util/memory_allocator.h b/src/rocksdb/util/memory_allocator.h
new file mode 100644
index 00000000..99a7241d
--- /dev/null
+++ b/src/rocksdb/util/memory_allocator.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include "rocksdb/memory_allocator.h"
+
+namespace rocksdb {
+
+struct CustomDeleter {
+  CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
+
+  void operator()(char* ptr) const {
+    if (allocator) {
+      allocator->Deallocate(reinterpret_cast<void*>(ptr));
+    } else {
+      delete[] ptr;
+    }
+  }
+
+  MemoryAllocator* allocator;
+};
+
+using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
+
+inline CacheAllocationPtr AllocateBlock(size_t size,
+                                        MemoryAllocator* allocator) {
+  if (allocator) {
+    auto block = reinterpret_cast<char*>(allocator->Allocate(size));
+    return CacheAllocationPtr(block, allocator);
+  }
+  return CacheAllocationPtr(new char[size]);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/memory_usage.h b/src/rocksdb/util/memory_usage.h
new file mode 100644
index 00000000..0d885445
--- /dev/null
+++ b/src/rocksdb/util/memory_usage.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <unordered_map>
+
+namespace rocksdb {
+
+// Helper methods to estimate memory usage by std containers.
+ +template <class Key, class Value, class Hash> +size_t ApproximateMemoryUsage( + const std::unordered_map<Key, Value, Hash>& umap) { + typedef std::unordered_map<Key, Value, Hash> Map; + return sizeof(umap) + + // Size of all items plus a next pointer for each item. + (sizeof(typename Map::value_type) + sizeof(void*)) * umap.size() + + // Size of hash buckets. + umap.bucket_count() * sizeof(void*); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/mock_time_env.h b/src/rocksdb/util/mock_time_env.h new file mode 100644 index 00000000..feada477 --- /dev/null +++ b/src/rocksdb/util/mock_time_env.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace rocksdb { + +class MockTimeEnv : public EnvWrapper { + public: + explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + + virtual Status GetCurrentTime(int64_t* time) override { + assert(time != nullptr); + assert(current_time_ <= + static_cast<uint64_t>(std::numeric_limits<int64_t>::max())); + *time = static_cast<int64_t>(current_time_); + return Status::OK(); + } + + virtual uint64_t NowMicros() override { + assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000); + return current_time_ * 1000000; + } + + virtual uint64_t NowNanos() override { + assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000000); + return current_time_ * 1000000000; + } + + uint64_t RealNowMicros() { return target()->NowMicros(); } + + void set_current_time(uint64_t time) { + assert(time >= current_time_); + current_time_ = time; + } + + private: + std::atomic<uint64_t> current_time_{0}; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/murmurhash.cc b/src/rocksdb/util/murmurhash.cc new file mode 100644 index 00000000..3b759c5e --- /dev/null +++ b/src/rocksdb/util/murmurhash.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash + is under the MIT license. +*/ +#include "murmurhash.h" +#include "util/util.h" + +#if defined(__x86_64__) + +// ------------------------------------------------------------------- +// +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. 
+// +// 64-bit hash for 64-bit platforms + +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; FALLTHROUGH_INTENDED; + case 6: h ^= ((uint64_t)data2[5]) << 40; FALLTHROUGH_INTENDED; + case 5: h ^= ((uint64_t)data2[4]) << 32; FALLTHROUGH_INTENDED; + case 4: h ^= ((uint64_t)data2[3]) << 24; FALLTHROUGH_INTENDED; + case 3: h ^= ((uint64_t)data2[2]) << 16; FALLTHROUGH_INTENDED; + case 2: h ^= ((uint64_t)data2[1]) << 8; FALLTHROUGH_INTENDED; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED; + case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#else + +// ------------------------------------------------------------------- +// +// Same as MurmurHash2, but endian- and alignment-neutral. +// Half the speed though, alas. 
+ +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h = seed ^ len; + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k; + + k = data[0]; + k |= data[1] << 8; + k |= data[2] << 16; + k |= data[3] << 24; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + switch(len) + { + case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED; + case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED; + case 1: h ^= data[0]; + h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#endif diff --git a/src/rocksdb/util/murmurhash.h b/src/rocksdb/util/murmurhash.h new file mode 100644 index 00000000..cbfc4068 --- /dev/null +++ b/src/rocksdb/util/murmurhash.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash + is under the MIT license. +*/ +#pragma once +#include <stdint.h> +#include "rocksdb/slice.h" + +#if defined(__x86_64__) +#define MURMUR_HASH MurmurHash64A +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash64A +typedef uint64_t murmur_t; + +#elif defined(__i386__) +#define MURMUR_HASH MurmurHash2 +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash2 +typedef unsigned int murmur_t; + +#else +#define MURMUR_HASH MurmurHashNeutral2 +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHashNeutral2 +typedef unsigned int murmur_t; +#endif + +// Allow slice to be hashable by murmur hash. +namespace rocksdb { +struct murmur_hash { + size_t operator()(const Slice& slice) const { + return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0); + } +}; +} // rocksdb diff --git a/src/rocksdb/util/mutexlock.h b/src/rocksdb/util/mutexlock.h new file mode 100644 index 00000000..640cef3d --- /dev/null +++ b/src/rocksdb/util/mutexlock.h @@ -0,0 +1,131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <assert.h> +#include <atomic> +#include <mutex> +#include <thread> +#include "port/port.h" + +namespace rocksdb { + +// Helper class that locks a mutex on construction and unlocks the mutex when +// the destructor of the MutexLock object is invoked. +// +// Typical usage: +// +// void MyClass::MyMethod() { +// MutexLock l(&mu_); // mu_ is an instance variable +// ... some complex code, possibly with multiple return paths ... 
+// }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
+    this->mu_->Lock();
+  }
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+  // No copying allowed
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+};
+
+//
+// Acquire a ReadLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->ReadLock();
+  }
+  ~ReadLock() { this->mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadLock(const ReadLock&);
+  void operator=(const ReadLock&);
+};
+
+//
+// Automatically unlock a locked mutex when the object is destroyed
+//
+class ReadUnlock {
+ public:
+  explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu->AssertHeld(); }
+  ~ReadUnlock() { mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadUnlock(const ReadUnlock &) = delete;
+  ReadUnlock &operator=(const ReadUnlock &) = delete;
+};
+
+//
+// Acquire a WriteLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->WriteLock();
+  }
+  ~WriteLock() { this->mu_->WriteUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  WriteLock(const WriteLock&);
+  void operator=(const WriteLock&);
+};
+
+//
+// SpinMutex has very low overhead for low-contention cases. Method names
+// are chosen so you can use std::unique_lock or std::lock_guard with it.
+//
+class SpinMutex {
+ public:
+  SpinMutex() : locked_(false) {}
+
+  bool try_lock() {
+    auto currently_locked = locked_.load(std::memory_order_relaxed);
+    return !currently_locked &&
+           locked_.compare_exchange_weak(currently_locked, true,
+                                         std::memory_order_acquire,
+                                         std::memory_order_relaxed);
+  }
+
+  void lock() {
+    for (size_t tries = 0;; ++tries) {
+      if (try_lock()) {
+        // success
+        break;
+      }
+      port::AsmVolatilePause();
+      if (tries > 100) {
+        std::this_thread::yield();
+      }
+    }
+  }
+
+  void unlock() { locked_.store(false, std::memory_order_release); }
+
+ private:
+  std::atomic<bool> locked_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/ppc-opcode.h b/src/rocksdb/util/ppc-opcode.h
new file mode 100644
index 00000000..554fa50a
--- /dev/null
+++ b/src/rocksdb/util/ppc-opcode.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// This source code is also licensed under the GPLv2 license found in the
+// COPYING file in the root directory of this source tree.
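Because SpinMutex's method names follow the standard C++ Lockable convention (lock/try_lock/unlock), the usual RAII wrappers from the standard library apply to it directly, just as the guard classes above do for port::Mutex and port::RWMutex. A small hypothetical example (the Counter class is invented for illustration):

#include <cstdint>
#include <mutex>

#include "util/mutexlock.h"

class Counter {
 public:
  void Increment() {
    // Unlocks automatically when the guard goes out of scope, on every
    // return path, mirroring the MutexLock idiom above.
    std::lock_guard<rocksdb::SpinMutex> guard(mu_);
    ++value_;
  }
  uint64_t Get() {
    std::lock_guard<rocksdb::SpinMutex> guard(mu_);
    return value_;
  }

 private:
  rocksdb::SpinMutex mu_;
  uint64_t value_ = 0;
};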
+ +#pragma once + +#define __PPC_RA(a) (((a)&0x1f) << 16) +#define __PPC_RB(b) (((b)&0x1f) << 11) +#define __PPC_XA(a) ((((a)&0x1f) << 16) | (((a)&0x20) >> 3)) +#define __PPC_XB(b) ((((b)&0x1f) << 11) | (((b)&0x20) >> 4)) +#define __PPC_XS(s) ((((s)&0x1f) << 21) | (((s)&0x20) >> 5)) +#define __PPC_XT(s) __PPC_XS(s) +#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b)) +#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b)) + +#define PPC_INST_VPMSUMW 0x10000488 +#define PPC_INST_VPMSUMD 0x100004c8 +#define PPC_INST_MFVSRD 0x7c000066 +#define PPC_INST_MTVSRD 0x7c000166 + +#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b) +#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b) +#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t) + 32, a, 0) +#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t) + 32, a, 0) diff --git a/src/rocksdb/util/random.cc b/src/rocksdb/util/random.cc new file mode 100644 index 00000000..5e2cf626 --- /dev/null +++ b/src/rocksdb/util/random.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "util/random.h" + +#include <stdint.h> +#include <string.h> +#include <thread> +#include <utility> + +#include "port/likely.h" +#include "util/thread_local.h" + +#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#define STORAGE_DECL static __thread +#else +#define STORAGE_DECL static +#endif + +namespace rocksdb { + +Random* Random::GetTLSInstance() { + STORAGE_DECL Random* tls_instance; + STORAGE_DECL std::aligned_storage<sizeof(Random)>::type tls_instance_bytes; + + auto rv = tls_instance; + if (UNLIKELY(rv == nullptr)) { + size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id()); + rv = new (&tls_instance_bytes) Random((uint32_t)seed); + tls_instance = rv; + } + return rv; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/random.h b/src/rocksdb/util/random.h new file mode 100644 index 00000000..2a5fcbc6 --- /dev/null +++ b/src/rocksdb/util/random.h @@ -0,0 +1,109 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <random> +#include <stdint.h> + +namespace rocksdb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + enum : uint32_t { + M = 2147483647L // 2^31-1 + }; + enum : uint64_t { + A = 16807 // bits 14, 8, 7, 5, 2, 1, 0 + }; + + uint32_t seed_; + + static uint32_t GoodSeed(uint32_t s) { return (s & M) != 0 ? 
(s & M) : 1; } + + public: + // This is the largest value that can be returned from Next() + enum : uint32_t { kMaxNext = M }; + + explicit Random(uint32_t s) : seed_(GoodSeed(s)) {} + + void Reset(uint32_t s) { seed_ = GoodSeed(s); } + + uint32_t Next() { + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast<uint32_t>((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } + + // Returns a Random instance for use by the current thread without + // additional locking + static Random* GetTLSInstance(); +}; + +// A simple 64bit random number generator based on std::mt19937_64 +class Random64 { + private: + std::mt19937_64 generator_; + + public: + explicit Random64(uint64_t s) : generator_(s) { } + + // Generates the next random number + uint64_t Next() { return generator_(); } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint64_t Uniform(uint64_t n) { + return std::uniform_int_distribution<uint64_t>(0, n - 1)(generator_); + } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(uint64_t n) { return Uniform(n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint64_t Skewed(int max_log) { + return Uniform(uint64_t(1) << Uniform(max_log + 1)); + } +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/rate_limiter.cc b/src/rocksdb/util/rate_limiter.cc new file mode 100644 index 00000000..9d23c38f --- /dev/null +++ b/src/rocksdb/util/rate_limiter.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
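The branchless reduction in Random::Next() above is worth spelling out: with M = 2^31 - 1 we have 2^31 ≡ 1 (mod M), so product % M equals (product >> 31) + (product & M), minus M at most once; the value M itself never occurs because seed_ stays in [1, M-1]. A standalone check of that identity (an illustrative program, not part of the diff):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t M = 2147483647ULL;  // 2^31 - 1
  const uint64_t A = 16807ULL;
  uint32_t seed = 1;
  for (int i = 0; i < 1000000; ++i) {
    uint64_t product = seed * A;  // fits in 64 bits: < 2^31 * 16807
    // The fast reduction used by Random::Next().
    uint32_t fast = static_cast<uint32_t>((product >> 31) + (product & M));
    if (fast > M) {
      fast -= M;  // the first reduction may overflow by one bit
    }
    assert(fast == product % M);  // matches the plain modulo
    seed = fast;
  }
  return 0;
}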
+
+#include "util/rate_limiter.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "util/aligned_buffer.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+size_t RateLimiter::RequestToken(size_t bytes, size_t alignment,
+                                 Env::IOPriority io_priority, Statistics* stats,
+                                 RateLimiter::OpType op_type) {
+  if (io_priority < Env::IO_TOTAL && IsRateLimited(op_type)) {
+    bytes = std::min(bytes, static_cast<size_t>(GetSingleBurstBytes()));
+
+    if (alignment > 0) {
+      // Here we may require more than the burst size and block, but we
+      // cannot write less than one page at a time on direct I/O, so callers
+      // may prefer not to use the rate limiter in that case.
+      bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
+    }
+    Request(bytes, io_priority, stats, op_type);
+  }
+  return bytes;
+}
+
+// Pending request
+struct GenericRateLimiter::Req {
+  explicit Req(int64_t _bytes, port::Mutex* _mu)
+      : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {}
+  int64_t request_bytes;
+  int64_t bytes;
+  port::CondVar cv;
+  bool granted;
+};
+
+GenericRateLimiter::GenericRateLimiter(int64_t rate_bytes_per_sec,
+                                       int64_t refill_period_us,
+                                       int32_t fairness, RateLimiter::Mode mode,
+                                       Env* env, bool auto_tuned)
+    : RateLimiter(mode),
+      refill_period_us_(refill_period_us),
+      rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2
+                                     : rate_bytes_per_sec),
+      refill_bytes_per_period_(
+          CalculateRefillBytesPerPeriod(rate_bytes_per_sec_)),
+      env_(env),
+      stop_(false),
+      exit_cv_(&request_mutex_),
+      requests_to_wait_(0),
+      available_bytes_(0),
+      next_refill_us_(NowMicrosMonotonic(env_)),
+      fairness_(fairness > 100 ? 100 : fairness),
+      rnd_((uint32_t)time(nullptr)),
+      leader_(nullptr),
+      auto_tuned_(auto_tuned),
+      num_drains_(0),
+      prev_num_drains_(0),
+      max_bytes_per_sec_(rate_bytes_per_sec),
+      tuned_time_(NowMicrosMonotonic(env_)) {
+  total_requests_[0] = 0;
+  total_requests_[1] = 0;
+  total_bytes_through_[0] = 0;
+  total_bytes_through_[1] = 0;
+}
+
+GenericRateLimiter::~GenericRateLimiter() {
+  MutexLock g(&request_mutex_);
+  stop_ = true;
+  requests_to_wait_ = static_cast<int32_t>(queue_[Env::IO_LOW].size() +
+                                           queue_[Env::IO_HIGH].size());
+  for (auto& r : queue_[Env::IO_HIGH]) {
+    r->cv.Signal();
+  }
+  for (auto& r : queue_[Env::IO_LOW]) {
+    r->cv.Signal();
+  }
+  while (requests_to_wait_ > 0) {
+    exit_cv_.Wait();
+  }
+}
+
+// This API allows the user to dynamically change the rate limiter's bytes
+// per second.
+void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) {
+  assert(bytes_per_second > 0);
+  rate_bytes_per_sec_ = bytes_per_second;
+  refill_bytes_per_period_.store(
+      CalculateRefillBytesPerPeriod(bytes_per_second),
+      std::memory_order_relaxed);
+}
+
+void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
+                                 Statistics* stats) {
+  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+  TEST_SYNC_POINT("GenericRateLimiter::Request");
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1",
+                           &rate_bytes_per_sec_);
+  MutexLock g(&request_mutex_);
+
+  if (auto_tuned_) {
+    static const int kRefillsPerTune = 100;
+    std::chrono::microseconds now(NowMicrosMonotonic(env_));
+    if (now - tuned_time_ >=
+        kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) {
+      Tune();
+    }
+  }
+
+  if (stop_) {
+    return;
+  }
+
+  ++total_requests_[pri];
+
+  if (available_bytes_ >= bytes) {
+    // The refill thread assigns quota and notifies requests waiting on
+    // the queue under the mutex. 
So if we get here, it means no
+    // request is waiting in the queue.
+    available_bytes_ -= bytes;
+    total_bytes_through_[pri] += bytes;
+    return;
+  }
+
+  // Request cannot be satisfied at this moment, enqueue
+  Req r(bytes, &request_mutex_);
+  queue_[pri].push_back(&r);
+
+  do {
+    bool timedout = false;
+    // Leader election, candidates can be:
+    // (1) a new incoming request,
+    // (2) a previous leader, whose quota has not been assigned yet due
+    //     to lower priority,
+    // (3) a previous waiter at the front of the queue, who got notified by
+    //     the previous leader
+    if (leader_ == nullptr &&
+        ((!queue_[Env::IO_HIGH].empty() &&
+          &r == queue_[Env::IO_HIGH].front()) ||
+         (!queue_[Env::IO_LOW].empty() &&
+          &r == queue_[Env::IO_LOW].front()))) {
+      leader_ = &r;
+      int64_t delta = next_refill_us_ - NowMicrosMonotonic(env_);
+      delta = delta > 0 ? delta : 0;
+      if (delta == 0) {
+        timedout = true;
+      } else {
+        int64_t wait_until = env_->NowMicros() + delta;
+        RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS);
+        ++num_drains_;
+        timedout = r.cv.TimedWait(wait_until);
+      }
+    } else {
+      // Not at the front of the queue, or a leader has already been elected
+      r.cv.Wait();
+    }
+
+    // request_mutex_ is held from now on
+    if (stop_) {
+      --requests_to_wait_;
+      exit_cv_.Signal();
+      return;
+    }
+
+    // Make sure the woken-up request is always at the head of its queue
+    assert(r.granted ||
+           (!queue_[Env::IO_HIGH].empty() &&
+            &r == queue_[Env::IO_HIGH].front()) ||
+           (!queue_[Env::IO_LOW].empty() &&
+            &r == queue_[Env::IO_LOW].front()));
+    assert(leader_ == nullptr ||
+           (!queue_[Env::IO_HIGH].empty() &&
+            leader_ == queue_[Env::IO_HIGH].front()) ||
+           (!queue_[Env::IO_LOW].empty() &&
+            leader_ == queue_[Env::IO_LOW].front()));
+
+    if (leader_ == &r) {
+      // Woken up from TimedWait()
+      if (timedout) {
+        // Time to do refill!
+        Refill();
+
+        // Re-elect a new leader regardless. This is to simplify the
+        // election handling.
+        leader_ = nullptr;
+
+        // Notify the head of the queue if the current leader is going away
+        if (r.granted) {
+          // The current leader was already granted its quota. Notify the
+          // head of the waiting queue so it participates in the next round
+          // of election.
+          assert((queue_[Env::IO_HIGH].empty() ||
+                  &r != queue_[Env::IO_HIGH].front()) &&
+                 (queue_[Env::IO_LOW].empty() ||
+                  &r != queue_[Env::IO_LOW].front()));
+          if (!queue_[Env::IO_HIGH].empty()) {
+            queue_[Env::IO_HIGH].front()->cv.Signal();
+          } else if (!queue_[Env::IO_LOW].empty()) {
+            queue_[Env::IO_LOW].front()->cv.Signal();
+          }
+          // Done
+          break;
+        }
+      } else {
+        // Spurious wake-up, need to continue to wait
+        assert(!r.granted);
+        leader_ = nullptr;
+      }
+    } else {
+      // Woken up by the previous leader:
+      // (1) if the requested quota was granted, it is done.
+      // (2) if the requested quota was not granted, this means the current
+      // thread was picked as a new leader candidate (the previous leader got
+      // its quota). It needs to participate in leader election because a new
+      // request may come in before this thread is woken up. So it may
+      // actually need to do Wait() again.
+      assert(!timedout);
+    }
+  } while (!r.granted);
+}
+
+void GenericRateLimiter::Refill() {
+  TEST_SYNC_POINT("GenericRateLimiter::Refill");
+  next_refill_us_ = NowMicrosMonotonic(env_) + refill_period_us_;
+  // Carry over the leftover quota from the last period
+  auto refill_bytes_per_period =
+      refill_bytes_per_period_.load(std::memory_order_relaxed);
+  if (available_bytes_ < refill_bytes_per_period) {
+    available_bytes_ += refill_bytes_per_period;
+  }
+
+  int use_low_pri_first = rnd_.OneIn(fairness_) ? 
0 : 1; + for (int q = 0; q < 2; ++q) { + auto use_pri = (use_low_pri_first == q) ? Env::IO_LOW : Env::IO_HIGH; + auto* queue = &queue_[use_pri]; + while (!queue->empty()) { + auto* next_req = queue->front(); + if (available_bytes_ < next_req->request_bytes) { + // avoid starvation + next_req->request_bytes -= available_bytes_; + available_bytes_ = 0; + break; + } + available_bytes_ -= next_req->request_bytes; + next_req->request_bytes = 0; + total_bytes_through_[use_pri] += next_req->bytes; + queue->pop_front(); + + next_req->granted = true; + if (next_req != leader_) { + // Quota granted, signal the thread + next_req->cv.Signal(); + } + } + } +} + +int64_t GenericRateLimiter::CalculateRefillBytesPerPeriod( + int64_t rate_bytes_per_sec) { + if (port::kMaxInt64 / rate_bytes_per_sec < refill_period_us_) { + // Avoid unexpected result in the overflow case. The result now is still + // inaccurate but is a number that is large enough. + return port::kMaxInt64 / 1000000; + } else { + return std::max(kMinRefillBytesPerPeriod, + rate_bytes_per_sec * refill_period_us_ / 1000000); + } +} + +Status GenericRateLimiter::Tune() { + const int kLowWatermarkPct = 50; + const int kHighWatermarkPct = 90; + const int kAdjustFactorPct = 5; + // computed rate limit will be in + // `[max_bytes_per_sec_ / kAllowedRangeFactor, max_bytes_per_sec_]`. + const int kAllowedRangeFactor = 20; + + std::chrono::microseconds prev_tuned_time = tuned_time_; + tuned_time_ = std::chrono::microseconds(NowMicrosMonotonic(env_)); + + int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time + + std::chrono::microseconds(refill_period_us_) - + std::chrono::microseconds(1)) / + std::chrono::microseconds(refill_period_us_); + // We tune every kRefillsPerTune intervals, so the overflow and division-by- + // zero conditions should never happen. 
+ assert(num_drains_ - prev_num_drains_ <= port::kMaxInt64 / 100); + assert(elapsed_intervals > 0); + int64_t drained_pct = + (num_drains_ - prev_num_drains_) * 100 / elapsed_intervals; + + int64_t prev_bytes_per_sec = GetBytesPerSecond(); + int64_t new_bytes_per_sec; + if (drained_pct == 0) { + new_bytes_per_sec = max_bytes_per_sec_ / kAllowedRangeFactor; + } else if (drained_pct < kLowWatermarkPct) { + // sanitize to prevent overflow + int64_t sanitized_prev_bytes_per_sec = + std::min(prev_bytes_per_sec, port::kMaxInt64 / 100); + new_bytes_per_sec = + std::max(max_bytes_per_sec_ / kAllowedRangeFactor, + sanitized_prev_bytes_per_sec * 100 / (100 + kAdjustFactorPct)); + } else if (drained_pct > kHighWatermarkPct) { + // sanitize to prevent overflow + int64_t sanitized_prev_bytes_per_sec = std::min( + prev_bytes_per_sec, port::kMaxInt64 / (100 + kAdjustFactorPct)); + new_bytes_per_sec = + std::min(max_bytes_per_sec_, + sanitized_prev_bytes_per_sec * (100 + kAdjustFactorPct) / 100); + } else { + new_bytes_per_sec = prev_bytes_per_sec; + } + if (new_bytes_per_sec != prev_bytes_per_sec) { + SetBytesPerSecond(new_bytes_per_sec); + } + num_drains_ = prev_num_drains_; + return Status::OK(); +} + +RateLimiter* NewGenericRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */, + int32_t fairness /* = 10 */, + RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */, + bool auto_tuned /* = false */) { + assert(rate_bytes_per_sec > 0); + assert(refill_period_us > 0); + assert(fairness > 0); + return new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness, + mode, Env::Default(), auto_tuned); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/rate_limiter.h b/src/rocksdb/util/rate_limiter.h new file mode 100644 index 00000000..cb91f0ae --- /dev/null +++ b/src/rocksdb/util/rate_limiter.h @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <algorithm> +#include <atomic> +#include <chrono> +#include <deque> +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/rate_limiter.h" +#include "util/mutexlock.h" +#include "util/random.h" + +namespace rocksdb { + +class GenericRateLimiter : public RateLimiter { + public: + GenericRateLimiter(int64_t refill_bytes, int64_t refill_period_us, + int32_t fairness, RateLimiter::Mode mode, Env* env, + bool auto_tuned); + + virtual ~GenericRateLimiter(); + + // This API allows user to dynamically change rate limiter's bytes per second. + virtual void SetBytesPerSecond(int64_t bytes_per_second) override; + + // Request for token to write bytes. If this request can not be satisfied, + // the call is blocked. 
The caller is responsible for
+  // ensuring bytes <= GetSingleBurstBytes().
+  using RateLimiter::Request;
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* stats) override;
+
+  virtual int64_t GetSingleBurstBytes() const override {
+    return refill_bytes_per_period_.load(std::memory_order_relaxed);
+  }
+
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_bytes_through_[Env::IO_LOW] +
+             total_bytes_through_[Env::IO_HIGH];
+    }
+    return total_bytes_through_[pri];
+  }
+
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_requests_[Env::IO_LOW] + total_requests_[Env::IO_HIGH];
+    }
+    return total_requests_[pri];
+  }
+
+  virtual int64_t GetBytesPerSecond() const override {
+    return rate_bytes_per_sec_;
+  }
+
+ private:
+  void Refill();
+  int64_t CalculateRefillBytesPerPeriod(int64_t rate_bytes_per_sec);
+  Status Tune();
+
+  uint64_t NowMicrosMonotonic(Env* env) {
+    return env->NowNanos() / std::milli::den;
+  }
+
+  // This mutex guards all internal state
+  mutable port::Mutex request_mutex_;
+
+  const int64_t kMinRefillBytesPerPeriod = 100;
+
+  const int64_t refill_period_us_;
+
+  int64_t rate_bytes_per_sec_;
+  // This variable can be changed dynamically.
+  std::atomic<int64_t> refill_bytes_per_period_;
+  Env* const env_;
+
+  bool stop_;
+  port::CondVar exit_cv_;
+  int32_t requests_to_wait_;
+
+  int64_t total_requests_[Env::IO_TOTAL];
+  int64_t total_bytes_through_[Env::IO_TOTAL];
+  int64_t available_bytes_;
+  int64_t next_refill_us_;
+
+  int32_t fairness_;
+  Random rnd_;
+
+  struct Req;
+  Req* leader_;
+  std::deque<Req*> queue_[Env::IO_TOTAL];
+
+  bool auto_tuned_;
+  int64_t num_drains_;
+  int64_t prev_num_drains_;
+  const int64_t max_bytes_per_sec_;
+  std::chrono::microseconds tuned_time_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/rate_limiter_test.cc b/src/rocksdb/util/rate_limiter_test.cc
new file mode 100644
index 00000000..d3f3be3b
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter_test.cc
@@ -0,0 +1,239 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "util/rate_limiter.h"
+
+#include <inttypes.h>
+#include <chrono>
+#include <limits>
+
+#include "db/db_test_util.h"
+#include "rocksdb/env.h"
+#include "util/random.h"
+#include "util/sync_point.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// TODO(yhchiang): the rate will not be accurate when we run tests in parallel.
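A sketch of the client-side contract stated in rate_limiter.h above: since a single Request() must not exceed GetSingleBurstBytes(), large writes are fed to the limiter in chunks. The helper below is hypothetical and only illustrates the pattern; inside RocksDB this chunking is handled by the file writer via RateLimiter::RequestToken.

#include <algorithm>
#include <cstddef>

#include "rocksdb/env.h"
#include "rocksdb/rate_limiter.h"

void RateLimitedWrite(rocksdb::RateLimiter* limiter, const char* data,
                      size_t size) {
  size_t written = 0;
  while (written < size) {
    size_t allowed = static_cast<size_t>(limiter->GetSingleBurstBytes());
    size_t chunk = std::min(size - written, allowed);
    // Blocks until `chunk` bytes of quota have been granted at this priority.
    limiter->Request(static_cast<int64_t>(chunk), rocksdb::Env::IO_LOW,
                     nullptr /* stats */,
                     rocksdb::RateLimiter::OpType::kWrite);
    // ... actually write data[written, written + chunk) here ...
    written += chunk;
  }
}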
+class RateLimiterTest : public testing::Test {}; + +TEST_F(RateLimiterTest, OverflowRate) { + GenericRateLimiter limiter(port::kMaxInt64, 1000, 10, + RateLimiter::Mode::kWritesOnly, Env::Default(), + false /* auto_tuned */); + ASSERT_GT(limiter.GetSingleBurstBytes(), 1000000000ll); +} + +TEST_F(RateLimiterTest, StartStop) { + std::unique_ptr<RateLimiter> limiter(NewGenericRateLimiter(100, 100, 10)); +} + +TEST_F(RateLimiterTest, Modes) { + for (auto mode : {RateLimiter::Mode::kWritesOnly, + RateLimiter::Mode::kReadsOnly, RateLimiter::Mode::kAllIo}) { + GenericRateLimiter limiter( + 2000 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, mode, Env::Default(), false /* auto_tuned */); + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kRead); + if (mode == RateLimiter::Mode::kWritesOnly) { + ASSERT_EQ(0, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + if (mode == RateLimiter::Mode::kAllIo) { + ASSERT_EQ(2000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + } +} + +#if !(defined(TRAVIS) && defined(OS_MACOSX)) +TEST_F(RateLimiterTest, Rate) { + auto* env = Env::Default(); + struct Arg { + Arg(int32_t _target_rate, int _burst) + : limiter(NewGenericRateLimiter(_target_rate, 100 * 1000, 10)), + request_size(_target_rate / 10), + burst(_burst) {} + std::unique_ptr<RateLimiter> limiter; + int32_t request_size; + int burst; + }; + + auto writer = [](void* p) { + auto* thread_env = Env::Default(); + auto* arg = static_cast<Arg*>(p); + // Test for 2 seconds + auto until = thread_env->NowMicros() + 2 * 1000000; + Random r((uint32_t)(thread_env->NowNanos() % + std::numeric_limits<uint32_t>::max())); + while (thread_env->NowMicros() < until) { + for (int i = 0; i < static_cast<int>(r.Skewed(arg->burst) + 1); ++i) { + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, + Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW, + nullptr /* stats */, RateLimiter::OpType::kWrite); + } + }; + + for (int i = 1; i <= 16; i *= 2) { + int32_t target = i * 1024 * 10; + Arg arg(target, i / 4 + 1); + int64_t old_total_bytes_through = 0; + for (int iter = 1; iter <= 2; ++iter) { + // second iteration changes the target dynamically + if (iter == 2) { + target *= 2; + arg.limiter->SetBytesPerSecond(target); + } + auto start = env->NowMicros(); + for (int t = 0; t < i; ++t) { + env->StartThread(writer, &arg); + } + env->WaitForJoin(); + + auto elapsed = env->NowMicros() - start; + double rate = + (arg.limiter->GetTotalBytesThrough() - old_total_bytes_through) * + 1000000.0 / elapsed; + old_total_bytes_through = arg.limiter->GetTotalBytesThrough(); + fprintf(stderr, + "request size [1 - %" PRIi32 "], limit %" PRIi32 + " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n", + arg.request_size - 1, target / 1024, rate / 1024, + elapsed / 1000000.0); + + ASSERT_GE(rate / target, 0.80); + ASSERT_LE(rate / target, 1.25); + } + } +} +#endif + +TEST_F(RateLimiterTest, LimitChangeTest) { + // starvation test when limit changes to a smaller value + int64_t refill_period = 1000 * 1000; + auto* env = Env::Default(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + struct Arg { + Arg(int32_t 
_request_size, Env::IOPriority _pri, + std::shared_ptr<RateLimiter> _limiter) + : request_size(_request_size), pri(_pri), limiter(_limiter) {} + int32_t request_size; + Env::IOPriority pri; + std::shared_ptr<RateLimiter> limiter; + }; + + auto writer = [](void* p) { + auto* arg = static_cast<Arg*>(p); + arg->limiter->Request(arg->request_size, arg->pri, nullptr /* stats */, + RateLimiter::OpType::kWrite); + }; + + for (uint32_t i = 1; i <= 16; i <<= 1) { + int32_t target = i * 1024 * 10; + // refill per second + for (int iter = 0; iter < 2; iter++) { + std::shared_ptr<RateLimiter> limiter = + std::make_shared<GenericRateLimiter>( + target, refill_period, 10, RateLimiter::Mode::kWritesOnly, + Env::Default(), false /* auto_tuned */); + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"GenericRateLimiter::Request", + "RateLimiterTest::LimitChangeTest:changeLimitStart"}, + {"RateLimiterTest::LimitChangeTest:changeLimitEnd", + "GenericRateLimiter::Refill"}}); + Arg arg(target, Env::IO_HIGH, limiter); + // The idea behind is to start a request first, then before it refills, + // update limit to a different value (2X/0.5X). No starvation should + // be guaranteed under any situation + // TODO(lightmark): more test cases are welcome. + env->StartThread(writer, &arg); + int32_t new_limit = (target << 1) >> (iter << 1); + TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitStart"); + arg.limiter->SetBytesPerSecond(new_limit); + TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitEnd"); + env->WaitForJoin(); + fprintf(stderr, + "[COMPLETE] request size %" PRIi32 " KB, new limit %" PRIi32 + "KB/sec, refill period %" PRIi64 " ms\n", + target / 1024, new_limit / 1024, refill_period / 1000); + } + } +} + +TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { + const std::chrono::seconds kTimePerRefill(1); + const int kRefillsPerTune = 100; // needs to match util/rate_limiter.cc + + SpecialEnv special_env(Env::Default()); + special_env.no_slowdown_ = true; + special_env.time_elapse_only_sleep_ = true; + + auto stats = CreateDBStatistics(); + std::unique_ptr<RateLimiter> rate_limiter(new GenericRateLimiter( + 1000 /* rate_bytes_per_sec */, + std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */, + RateLimiter::Mode::kWritesOnly, &special_env, true /* auto_tuned */)); + + // Use callback to advance time because we need to advance (1) after Request() + // has determined the bytes are not available; and (2) before Refill() + // computes the next refill time (ensuring refill time in the future allows + // the next request to drain the rate limiter). 
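+  // Aside: the auto-tuner exercised by this test (GenericRateLimiter::Tune(),
+  // shown earlier in rate_limiter.cc) rescales the limit multiplicatively.
+  // Illustrative arithmetic, assuming a hypothetical kAdjustFactorPct of 5
+  // (the real constant lives in rate_limiter.cc):
+  //   drained_pct > kHighWatermarkPct:
+  //     new_rate = min(max_bytes_per_sec_,
+  //                    prev_rate * (100 + kAdjustFactorPct) / 100)
+  //     e.g. 1000 B/s becomes 1050 B/s
+  //   drained_pct < kLowWatermarkPct:
+  //     new_rate = max(max_bytes_per_sec_ / kAllowedRangeFactor,
+  //                    prev_rate * 100 / (100 + kAdjustFactorPct))
+  //     e.g. 1050 B/s drops back to 1000 B/s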
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Refill", [&](void* /*arg*/) {
+        special_env.SleepForMicroseconds(static_cast<int>(
+            std::chrono::microseconds(kTimePerRefill).count()));
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // verify rate limit increases after a sequence of periods where rate limiter
+  // is always drained
+  int64_t orig_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+                        RateLimiter::OpType::kWrite);
+  while (std::chrono::microseconds(special_env.NowMicros()) <=
+         kRefillsPerTune * kTimePerRefill) {
+    rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+                          RateLimiter::OpType::kWrite);
+  }
+  int64_t new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  ASSERT_GT(new_bytes_per_sec, orig_bytes_per_sec);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  // decreases after a sequence of periods where rate limiter is not drained
+  orig_bytes_per_sec = new_bytes_per_sec;
+  special_env.SleepForMicroseconds(static_cast<int>(
+      kRefillsPerTune * std::chrono::microseconds(kTimePerRefill).count()));
+  // make a request so tuner can be triggered
+  rate_limiter->Request(1 /* bytes */, Env::IO_HIGH, stats.get(),
+                        RateLimiter::OpType::kWrite);
+  new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  ASSERT_LT(new_bytes_per_sec, orig_bytes_per_sec);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/repeatable_thread.h b/src/rocksdb/util/repeatable_thread.h
new file mode 100644
index 00000000..967cc499
--- /dev/null
+++ b/src/rocksdb/util/repeatable_thread.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "util/mock_time_env.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+class RepeatableThread {
+ public:
+  RepeatableThread(std::function<void()> function,
+                   const std::string& thread_name, Env* env, uint64_t delay_us,
+                   uint64_t initial_delay_us = 0)
+      : function_(function),
+        thread_name_("rocksdb:" + thread_name),
+        env_(env),
+        delay_us_(delay_us),
+        initial_delay_us_(initial_delay_us),
+        mutex_(env),
+        cond_var_(&mutex_),
+        running_(true),
+#ifndef NDEBUG
+        waiting_(false),
+        run_count_(0),
+#endif
+        thread_([this] { thread(); }) {
+  }
+
+  void cancel() {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (!running_) {
+        return;
+      }
+      running_ = false;
+      cond_var_.SignalAll();
+    }
+    thread_.join();
+  }
+
+  bool IsRunning() { return running_; }
+
+  ~RepeatableThread() { cancel(); }
+
+#ifndef NDEBUG
+  // Wait until RepeatableThread starts waiting, call the optional callback,
+  // then wait for one run of RepeatableThread. Tests can provide a
+  // custom env object to mock time, and use the callback here to bump current
+  // time and trigger RepeatableThread. See repeatable_thread_test for an
+  // example.
+  //
+  // Note: only supports one caller of this method.
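+  // A minimal sketch of driving this hook under a mocked clock (illustrative
+  // only; `runs` is an assumed local counter, and the pattern mirrors
+  // repeatable_thread_test.cc further below):
+  //
+  //   rocksdb::MockTimeEnv mock_env(rocksdb::Env::Default());
+  //   mock_env.set_current_time(0);
+  //   std::atomic<int> runs{0};
+  //   rocksdb::RepeatableThread thread([&] { runs++; }, "demo", &mock_env,
+  //                                    1000000 /* delay_us */,
+  //                                    1000000 /* initial_delay_us */);
+  //   thread.TEST_WaitForRun([&] { mock_env.set_current_time(1); });
+  //   thread.cancel();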
+ void TEST_WaitForRun(std::function<void()> callback = nullptr) { + InstrumentedMutexLock l(&mutex_); + while (!waiting_) { + cond_var_.Wait(); + } + uint64_t prev_count = run_count_; + if (callback != nullptr) { + callback(); + } + cond_var_.SignalAll(); + while (!(run_count_ > prev_count)) { + cond_var_.Wait(); + } + } +#endif + + private: + bool wait(uint64_t delay) { + InstrumentedMutexLock l(&mutex_); + if (running_ && delay > 0) { + uint64_t wait_until = env_->NowMicros() + delay; +#ifndef NDEBUG + waiting_ = true; + cond_var_.SignalAll(); +#endif + while (running_) { + cond_var_.TimedWait(wait_until); + if (env_->NowMicros() >= wait_until) { + break; + } + } +#ifndef NDEBUG + waiting_ = false; +#endif + } + return running_; + } + + void thread() { +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + // Set thread name. + auto thread_handle = thread_.native_handle(); + int ret __attribute__((__unused__)) = + pthread_setname_np(thread_handle, thread_name_.c_str()); + assert(ret == 0); +#endif +#endif + + assert(delay_us_ > 0); + if (!wait(initial_delay_us_)) { + return; + } + do { + function_(); +#ifndef NDEBUG + { + InstrumentedMutexLock l(&mutex_); + run_count_++; + cond_var_.SignalAll(); + } +#endif + } while (wait(delay_us_)); + } + + const std::function<void()> function_; + const std::string thread_name_; + Env* const env_; + const uint64_t delay_us_; + const uint64_t initial_delay_us_; + + // Mutex lock should be held when accessing running_, waiting_ + // and run_count_. + InstrumentedMutex mutex_; + InstrumentedCondVar cond_var_; + bool running_; +#ifndef NDEBUG + // RepeatableThread waiting for timeout. + bool waiting_; + // Times function_ had run. + uint64_t run_count_; +#endif + port::Thread thread_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/repeatable_thread_test.cc b/src/rocksdb/util/repeatable_thread_test.cc new file mode 100644 index 00000000..ee853c10 --- /dev/null +++ b/src/rocksdb/util/repeatable_thread_test.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <atomic> +#include <memory> + +#include "db/db_test_util.h" +#include "util/repeatable_thread.h" +#include "util/sync_point.h" +#include "util/testharness.h" + +class RepeatableThreadTest : public testing::Test { + public: + RepeatableThreadTest() + : mock_env_(new rocksdb::MockTimeEnv(rocksdb::Env::Default())) {} + + protected: + std::unique_ptr<rocksdb::MockTimeEnv> mock_env_; +}; + +TEST_F(RepeatableThreadTest, TimedTest) { + constexpr uint64_t kSecond = 1000000; // 1s = 1000000us + constexpr int kIteration = 3; + rocksdb::Env* env = rocksdb::Env::Default(); + rocksdb::port::Mutex mutex; + rocksdb::port::CondVar test_cv(&mutex); + int count = 0; + uint64_t prev_time = env->NowMicros(); + rocksdb::RepeatableThread thread( + [&] { + rocksdb::MutexLock l(&mutex); + count++; + uint64_t now = env->NowMicros(); + assert(count == 1 || prev_time + 1 * kSecond <= now); + prev_time = now; + if (count >= kIteration) { + test_cv.SignalAll(); + } + }, + "rt_test", env, 1 * kSecond); + // Wait for execution finish. 
+  {
+    rocksdb::MutexLock l(&mutex);
+    while (count < kIteration) {
+      test_cv.Wait();
+    }
+  }
+
+  // Test cancel
+  thread.cancel();
+}
+
+TEST_F(RepeatableThreadTest, MockEnvTest) {
+  constexpr uint64_t kSecond = 1000000;  // 1s = 1000000us
+  constexpr int kIteration = 3;
+  mock_env_->set_current_time(0);  // in seconds
+  std::atomic<int> count{0};
+
+#if defined(OS_MACOSX) && !defined(NDEBUG)
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+        // Obtain the current (real) time in seconds and add 1000 extra seconds
+        // to ensure that RepeatableThread::wait invokes TimedWait with a time
+        // greater than (real) current time. This is to prevent the TimedWait
+        // function from returning immediately without sleeping and releasing
+        // the mutex on certain platforms, e.g. OS X. If TimedWait returns
+        // immediately, the mutex will not be released, and
+        // RepeatableThread::TEST_WaitForRun never has a chance to execute the
+        // callback which, in this case, updates the result returned by
+        // mock_env->NowMicros. Consequently, RepeatableThread::wait cannot
+        // break out of the loop, causing the test to hang. The extra 1000
+        // seconds is a best-effort approach because there seems to be no
+        // reliable and deterministic way to provide the aforementioned
+        // guarantee. By the time RepeatableThread::wait is called, there is
+        // no guarantee that the delay + mock_env->NowMicros will be greater
+        // than the current real time. However, 1000 seconds should be
+        // sufficient in most cases.
+        uint64_t time_us = *reinterpret_cast<uint64_t*>(arg);
+        if (time_us < mock_env_->RealNowMicros()) {
+          *reinterpret_cast<uint64_t*>(arg) = mock_env_->RealNowMicros() + 1000;
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+#endif  // OS_MACOSX && !NDEBUG
+
+  rocksdb::RepeatableThread thread([&] { count++; }, "rt_test", mock_env_.get(),
+                                   1 * kSecond, 1 * kSecond);
+  for (int i = 1; i <= kIteration; i++) {
+    // Bump current time
+    thread.TEST_WaitForRun([&] { mock_env_->set_current_time(i); });
+  }
+  // The test function should be executed exactly kIteration times.
+  ASSERT_EQ(kIteration, count.load());
+
+  // Test cancel
+  thread.cancel();
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/set_comparator.h b/src/rocksdb/util/set_comparator.h
new file mode 100644
index 00000000..4ecd0040
--- /dev/null
+++ b/src/rocksdb/util/set_comparator.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace rocksdb {
+// A comparator to be used in std::set
+struct SetComparator {
+  explicit SetComparator() : user_comparator_(BytewiseComparator()) {}
+  explicit SetComparator(const Comparator* user_comparator)
+      : user_comparator_(user_comparator ? user_comparator
+                                         : BytewiseComparator()) {}
+  bool operator()(const Slice& lhs, const Slice& rhs) const {
+    return user_comparator_->Compare(lhs, rhs) < 0;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+};
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/slice.cc b/src/rocksdb/util/slice.cc
new file mode 100644
index 00000000..5e23ae0a
--- /dev/null
+++ b/src/rocksdb/util/slice.cc
@@ -0,0 +1,212 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/slice.h"
+#include "util/string_util.h"
+#include <stdio.h>
+
+namespace rocksdb {
+
+namespace {
+
+class FixedPrefixTransform : public SliceTransform {
+ private:
+  size_t prefix_len_;
+  std::string name_;
+
+ public:
+  explicit FixedPrefixTransform(size_t prefix_len)
+      : prefix_len_(prefix_len),
+        // Note that if any part of the name format changes, it will require
+        // changes on options_helper in order to make RocksDBOptionsParser work
+        // for the new change.
+        // TODO(yhchiang): move serialization / deserialization code inside
+        // the class implementation itself.
+        name_("rocksdb.FixedPrefix." + ToString(prefix_len_)) {}
+
+  const char* Name() const override { return name_.c_str(); }
+
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    return Slice(src.data(), prefix_len_);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    return (src.size() >= prefix_len_);
+  }
+
+  bool InRange(const Slice& dst) const override {
+    return (dst.size() == prefix_len_);
+  }
+
+  bool FullLengthEnabled(size_t* len) const override {
+    *len = prefix_len_;
+    return true;
+  }
+
+  bool SameResultWhenAppended(const Slice& prefix) const override {
+    return InDomain(prefix);
+  }
+};
+
+class CappedPrefixTransform : public SliceTransform {
+ private:
+  size_t cap_len_;
+  std::string name_;
+
+ public:
+  explicit CappedPrefixTransform(size_t cap_len)
+      : cap_len_(cap_len),
+        // Note that if any part of the name format changes, it will require
+        // changes on options_helper in order to make RocksDBOptionsParser work
+        // for the new change.
+        // TODO(yhchiang): move serialization / deserialization code inside
+        // the class implementation itself.
+        name_("rocksdb.CappedPrefix."
+ ToString(cap_len_)) {} + + const char* Name() const override { return name_.c_str(); } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return Slice(src.data(), std::min(cap_len_, src.size())); + } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& dst) const override { + return (dst.size() <= cap_len_); + } + + bool FullLengthEnabled(size_t* len) const override { + *len = cap_len_; + return true; + } + + bool SameResultWhenAppended(const Slice& prefix) const override { + return prefix.size() >= cap_len_; + } +}; + +class NoopTransform : public SliceTransform { + public: + explicit NoopTransform() { } + + const char* Name() const override { return "rocksdb.Noop"; } + + Slice Transform(const Slice& src) const override { return src; } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& /*dst*/) const override { return true; } + + bool SameResultWhenAppended(const Slice& /*prefix*/) const override { + return false; + } +}; + +} + +// 2 small internal utility functions, for efficient hex conversions +// and no need for snprintf, toupper etc... +// Originally from wdt/util/EncryptionUtils.cpp - for ToString(true)/DecodeHex: +char toHex(unsigned char v) { + if (v <= 9) { + return '0' + v; + } + return 'A' + v - 10; +} +// most of the code is for validation/error check +int fromHex(char c) { + // toupper: + if (c >= 'a' && c <= 'f') { + c -= ('a' - 'A'); // aka 0x20 + } + // validation + if (c < '0' || (c > '9' && (c < 'A' || c > 'F'))) { + return -1; // invalid not 0-9A-F hex char + } + if (c <= '9') { + return c - '0'; + } + return c - 'A' + 10; +} + +Slice::Slice(const SliceParts& parts, std::string* buf) { + size_t length = 0; + for (int i = 0; i < parts.num_parts; ++i) { + length += parts.parts[i].size(); + } + buf->reserve(length); + + for (int i = 0; i < parts.num_parts; ++i) { + buf->append(parts.parts[i].data(), parts.parts[i].size()); + } + data_ = buf->data(); + size_ = buf->size(); +} + +// Return a string that contains the copy of the referenced data. 
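+// For instance, a slice holding the two bytes {0x0f, 0xa0} yields "0FA0"
+// from ToString(true), using the uppercase digits produced by toHex() above
+// (illustrative values, not from the original sources).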
+std::string Slice::ToString(bool hex) const { + std::string result; // RVO/NRVO/move + if (hex) { + result.reserve(2 * size_); + for (size_t i = 0; i < size_; ++i) { + unsigned char c = data_[i]; + result.push_back(toHex(c >> 4)); + result.push_back(toHex(c & 0xf)); + } + return result; + } else { + result.assign(data_, size_); + return result; + } +} + +// Originally from rocksdb/utilities/ldb_cmd.h +bool Slice::DecodeHex(std::string* result) const { + std::string::size_type len = size_; + if (len % 2) { + // Hex string must be even number of hex digits to get complete bytes back + return false; + } + if (!result) { + return false; + } + result->clear(); + result->reserve(len / 2); + + for (size_t i = 0; i < len;) { + int h1 = fromHex(data_[i++]); + if (h1 < 0) { + return false; + } + int h2 = fromHex(data_[i++]); + if (h2 < 0) { + return false; + } + result->push_back(static_cast<char>((h1 << 4) | h2)); + } + return true; +} + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { + return new CappedPrefixTransform(cap_len); +} + +const SliceTransform* NewNoopTransform() { + return new NoopTransform; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/slice_transform_test.cc b/src/rocksdb/util/slice_transform_test.cc new file mode 100644 index 00000000..f91675cc --- /dev/null +++ b/src/rocksdb/util/slice_transform_test.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
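+// ToString(true) and DecodeHex() above form an encode/decode round trip. A
+// minimal sketch (illustrative only; assumes "rocksdb/slice.h" and <cassert>
+// are available):
+//
+//   std::string hex = rocksdb::Slice("\x12\x34", 2).ToString(true);
+//   assert(hex == "1234");
+//   std::string bytes;
+//   assert(rocksdb::Slice(hex).DecodeHex(&bytes));
+//   assert(bytes == std::string("\x12\x34", 2));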
+ +#include "rocksdb/slice_transform.h" + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "util/testharness.h" + +namespace rocksdb { + +class SliceTransformTest : public testing::Test {}; + +TEST_F(SliceTransformTest, CapPrefixTransform) { + std::string s; + s = "abcdefge"; + + std::unique_ptr<const SliceTransform> transform; + + transform.reset(NewCappedPrefixTransform(6)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdef"); + ASSERT_TRUE(transform->SameResultWhenAppended("123456")); + ASSERT_TRUE(transform->SameResultWhenAppended("1234567")); + ASSERT_TRUE(!transform->SameResultWhenAppended("12345")); + + transform.reset(NewCappedPrefixTransform(8)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(10)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform(s).ToString(), ""); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform("").ToString(), ""); +} + +class SliceTransformDBTest : public testing::Test { + private: + std::string dbname_; + Env* env_; + DB* db_; + + public: + SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) { + dbname_ = test::PerThreadDBPath("slice_transform_db_test"); + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + ~SliceTransformDBTest() override { + delete db_; + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + DB* db() { return db_; } + + // Return the current option configuration. + Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } + + Options last_options_; +}; + +namespace { +uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} +} // namespace + +TEST_F(SliceTransformDBTest, CapPrefix) { + last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8)); + last_options_.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_OK(TryReopen()); + + ReadOptions ro; + FlushOptions fo; + WriteOptions wo; + + ASSERT_OK(db()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(db()->Put(wo, "foo", "bar")); + ASSERT_OK(db()->Put(wo, "foo3", "bar3")); + ASSERT_OK(db()->Flush(fo)); + + std::unique_ptr<Iterator> iter(db()->NewIterator(ro)); + + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "bar"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U); + + iter->Seek("foo2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barbarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "foo"); + ASSERT_EQ(TestGetTickerCount(last_options_, 
BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barfoofoo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U); + + iter->Seek("foobarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/sst_file_manager_impl.cc b/src/rocksdb/util/sst_file_manager_impl.cc new file mode 100644 index 00000000..6a770b10 --- /dev/null +++ b/src/rocksdb/util/sst_file_manager_impl.cc @@ -0,0 +1,527 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/sst_file_manager_impl.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> +#include <vector> + +#include "db/db_impl.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/sst_file_manager.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger, + int64_t rate_bytes_per_sec, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) + : env_(env), + logger_(logger), + total_files_size_(0), + in_progress_files_size_(0), + compaction_buffer_size_(0), + cur_compactions_reserved_size_(0), + max_allowed_space_(0), + delete_scheduler_(env, rate_bytes_per_sec, logger.get(), this, + max_trash_db_ratio, bytes_max_delete_chunk), + cv_(&mu_), + closing_(false), + bg_thread_(nullptr), + reserved_disk_buffer_(0), + free_space_trigger_(0), + cur_instance_(nullptr) { +} + +SstFileManagerImpl::~SstFileManagerImpl() { + Close(); +} + +void SstFileManagerImpl::Close() { + { + MutexLock l(&mu_); + if (closing_) { + return; + } + closing_ = true; + cv_.SignalAll(); + } + if (bg_thread_) { + bg_thread_->join(); + } +} + +Status SstFileManagerImpl::OnAddFile(const std::string& file_path, + bool compaction) { + uint64_t file_size; + Status s = env_->GetFileSize(file_path, &file_size); + if (s.ok()) { + MutexLock l(&mu_); + OnAddFileImpl(file_path, file_size, compaction); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + return s; +} + +Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) { + { + MutexLock l(&mu_); + OnDeleteFileImpl(file_path); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile"); + return Status::OK(); +} + +void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) { + MutexLock l(&mu_); + uint64_t size_added_by_compaction = 0; + for (size_t i = 0; i < c->num_input_levels(); i++) { + for (size_t j = 0; j < c->num_input_files(i); j++) { + FileMetaData* filemeta = c->input(i, j); + size_added_by_compaction += filemeta->fd.GetFileSize(); + } + } + cur_compactions_reserved_size_ -= size_added_by_compaction; + + auto new_files = c->edit()->GetNewFiles(); + for (auto& new_file : new_files) { + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, + new_file.second.fd.GetNumber(), + new_file.second.fd.GetPathId()); + if (in_progress_files_.find(fn) != in_progress_files_.end()) { + auto tracked_file = tracked_files_.find(fn); + assert(tracked_file != 
tracked_files_.end());
+      in_progress_files_size_ -= tracked_file->second;
+      in_progress_files_.erase(fn);
+    }
+  }
+}
+
+Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
+                                      const std::string& new_path,
+                                      uint64_t* file_size) {
+  {
+    MutexLock l(&mu_);
+    if (file_size != nullptr) {
+      *file_size = tracked_files_[old_path];
+    }
+    OnAddFileImpl(new_path, tracked_files_[old_path], false);
+    OnDeleteFileImpl(old_path);
+  }
+  TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
+  return Status::OK();
+}
+
+void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
+  MutexLock l(&mu_);
+  max_allowed_space_ = max_allowed_space;
+}
+
+void SstFileManagerImpl::SetCompactionBufferSize(
+    uint64_t compaction_buffer_size) {
+  MutexLock l(&mu_);
+  compaction_buffer_size_ = compaction_buffer_size;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ >= max_allowed_space_;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ + cur_compactions_reserved_size_ >=
+         max_allowed_space_;
+}
+
+bool SstFileManagerImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    Status bg_error) {
+  MutexLock l(&mu_);
+  uint64_t size_added_by_compaction = 0;
+  // First check if we even have the space to do the compaction
+  for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t j = 0; j < inputs[i].size(); j++) {
+      FileMetaData* filemeta = inputs[i][j];
+      size_added_by_compaction += filemeta->fd.GetFileSize();
+    }
+  }
+
+  // Update cur_compactions_reserved_size_ so concurrent compactions
+  // don't max out space
+  size_t needed_headroom =
+      cur_compactions_reserved_size_ + size_added_by_compaction +
+      compaction_buffer_size_;
+  if (max_allowed_space_ != 0 &&
+      (needed_headroom + total_files_size_ > max_allowed_space_)) {
+    return false;
+  }
+
+  // Implement more aggressive checks only if this DB instance has already
+  // seen a NoSpace() error. This is in order to contain a single potentially
+  // misbehaving DB instance and prevent it from slowing down compactions of
+  // other DB instances
+  if (CheckFreeSpace() && bg_error == Status::NoSpace()) {
+    auto fn =
+        TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
+                      inputs[0][0]->fd.GetPathId());
+    uint64_t free_space = 0;
+    env_->GetFreeSpace(fn, &free_space);
+    // needed_headroom is based on current size reserved by compactions,
+    // minus any files created by running compactions as they would count
+    // against the reserved size. If the user didn't specify any compaction
+    // buffer, add reserved_disk_buffer_ that's calculated by default so the
+    // compaction doesn't end up leaving nothing for logs and flush SSTs
+    if (compaction_buffer_size_ == 0) {
+      needed_headroom += reserved_disk_buffer_;
+    }
+    needed_headroom -= in_progress_files_size_;
+    if (free_space < needed_headroom + size_added_by_compaction) {
+      // We hit the condition of not enough disk space
+      ROCKS_LOG_ERROR(logger_,
+                      "free space [%" PRIu64
+                      " bytes] is less than "
+                      "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
+                      free_space, needed_headroom);
+      return false;
+    }
+  }
+
+  cur_compactions_reserved_size_ += size_added_by_compaction;
+  // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
+  // a NoSpace error.
+  free_space_trigger_ = cur_compactions_reserved_size_;
+  return true;
+}
+
+uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
+  MutexLock l(&mu_);
+  return cur_compactions_reserved_size_;
+}
+
+uint64_t SstFileManagerImpl::GetTotalSize() {
+  MutexLock l(&mu_);
+  return total_files_size_;
+}
+
+std::unordered_map<std::string, uint64_t>
+SstFileManagerImpl::GetTrackedFiles() {
+  MutexLock l(&mu_);
+  return tracked_files_;
+}
+
+int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
+  return delete_scheduler_.GetRateBytesPerSecond();
+}
+
+void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
+  return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
+}
+
+double SstFileManagerImpl::GetMaxTrashDBRatio() {
+  return delete_scheduler_.GetMaxTrashDBRatio();
+}
+
+void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
+  return delete_scheduler_.SetMaxTrashDBRatio(r);
+}
+
+uint64_t SstFileManagerImpl::GetTotalTrashSize() {
+  return delete_scheduler_.GetTotalTrashSize();
+}
+
+void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
+                                           const std::string& path) {
+  MutexLock l(&mu_);
+
+  reserved_disk_buffer_ += size;
+  if (path_.empty()) {
+    path_ = path;
+  }
+}
+
+void SstFileManagerImpl::ClearError() {
+  while (true) {
+    MutexLock l(&mu_);
+
+    if (closing_) {
+      return;
+    }
+
+    uint64_t free_space;
+    Status s = env_->GetFreeSpace(path_, &free_space);
+    if (s.ok()) {
+      // In case of multi-DB instances, some of them may have experienced a
+      // soft error and some a hard error. In the SstFileManagerImpl, a hard
+      // error will basically override previously reported soft errors. Once
+      // we clear the hard error, we don't keep track of previous errors for
+      // now
+      if (bg_err_.severity() == Status::Severity::kHardError) {
+        if (free_space < reserved_disk_buffer_) {
+          ROCKS_LOG_ERROR(logger_,
+                          "free space [%" PRIu64
+                          " bytes] is less than "
+                          "required disk buffer [%" PRIu64 " bytes]\n",
+                          free_space, reserved_disk_buffer_);
+          ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
+          s = Status::NoSpace();
+        }
+      } else if (bg_err_.severity() == Status::Severity::kSoftError) {
+        if (free_space < free_space_trigger_) {
+          ROCKS_LOG_WARN(logger_,
+                         "free space [%" PRIu64
+                         " bytes] is less than "
+                         "free space for compaction trigger [%" PRIu64
+                         " bytes]\n",
+                         free_space, free_space_trigger_);
+          ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
+          s = Status::NoSpace();
+        }
+      }
+    }
+
+    // Someone could have called CancelErrorRecovery() and the list could have
+    // become empty, so check again here
+    if (s.ok() && !error_handler_list_.empty()) {
+      auto error_handler = error_handler_list_.front();
+      // Since we will release the mutex, set cur_instance_ to signal to the
+      // shutdown thread, if it calls CancelErrorRecovery() in the meantime,
+      // to indicate that this DB instance is busy. The DB instance is
+      // guaranteed to not be deleted before RecoverFromBGError() returns,
+      // since the ErrorHandler::recovery_in_prog_ flag would be true
+      cur_instance_ = error_handler;
+      mu_.Unlock();
+      s = error_handler->RecoverFromBGError();
+      mu_.Lock();
+      // The DB instance might have been deleted while we were
+      // waiting for the mutex, so check cur_instance_ to make sure it's
+      // still non-null
+      if (cur_instance_) {
+        // Check for error again, since the instance may have recovered but
+        // immediately got another error.
If that's the case, and the new
+        // error is also a NoSpace() non-fatal error, leave the instance in
+        // the list
+        Status err = cur_instance_->GetBGError();
+        if (s.ok() && err == Status::NoSpace() &&
+            err.severity() < Status::Severity::kFatalError) {
+          s = err;
+        }
+        cur_instance_ = nullptr;
+      }
+
+      if (s.ok() || s.IsShutdownInProgress() ||
+          (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
+        // If shutdown is in progress, abandon this handler instance
+        // and continue with the others
+        error_handler_list_.pop_front();
+      }
+    }
+
+    if (!error_handler_list_.empty()) {
+      // If there are more instances to be recovered, reschedule after 5
+      // seconds
+      int64_t wait_until = env_->NowMicros() + 5000000;
+      cv_.TimedWait(wait_until);
+    }
+
+    // Check again for error_handler_list_ empty, as a DB instance shutdown
+    // could have removed it from the queue while we were in timed wait
+    if (error_handler_list_.empty()) {
+      ROCKS_LOG_INFO(logger_, "Clearing error\n");
+      bg_err_ = Status::OK();
+      return;
+    }
+  }
+}
+
+void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
+                                            Status bg_error) {
+  MutexLock l(&mu_);
+  if (bg_error.severity() == Status::Severity::kSoftError) {
+    if (bg_err_.ok()) {
+      // Setting bg_err_ basically means we're in degraded mode
+      // Assume that all pending compactions will fail similarly. The trigger
+      // for clearing this condition is set to current compaction reserved
+      // size, so we stop checking disk space available in
+      // EnoughRoomForCompaction once this much free space is available
+      bg_err_ = bg_error;
+    }
+  } else if (bg_error.severity() == Status::Severity::kHardError) {
+    bg_err_ = bg_error;
+  } else {
+    assert(false);
+  }
+
+  // If this is the first instance of this error, kick off a thread to poll
+  // and recover from this condition
+  if (error_handler_list_.empty()) {
+    error_handler_list_.push_back(handler);
+    // Release lock before calling join. It's ok to do so because
+    // error_handler_list_ is now non-empty, so no other invocation of this
+    // function will execute this piece of code
+    mu_.Unlock();
+    if (bg_thread_) {
+      bg_thread_->join();
+    }
+    // Start a new thread. The previous one would have exited.
+ bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this)); + mu_.Lock(); + } else { + // Check if this DB instance is already in the list + for (auto iter = error_handler_list_.begin(); + iter != error_handler_list_.end(); ++iter) { + if ((*iter) == handler) { + return; + } + } + error_handler_list_.push_back(handler); + } +} + +bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) { + MutexLock l(&mu_); + + if (cur_instance_ == handler) { + // This instance is currently busy attempting to recover + // Nullify it so the recovery thread doesn't attempt to access it again + cur_instance_ = nullptr; + return false; + } + + for (auto iter = error_handler_list_.begin(); + iter != error_handler_list_.end(); ++iter) { + if ((*iter) == handler) { + error_handler_list_.erase(iter); + return true; + } + } + return false; +} + +Status SstFileManagerImpl::ScheduleFileDeletion( + const std::string& file_path, const std::string& path_to_sync, + const bool force_bg) { + TEST_SYNC_POINT("SstFileManagerImpl::ScheduleFileDeletion"); + return delete_scheduler_.DeleteFile(file_path, path_to_sync, + force_bg); +} + +void SstFileManagerImpl::WaitForEmptyTrash() { + delete_scheduler_.WaitForEmptyTrash(); +} + +void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, + uint64_t file_size, bool compaction) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file != tracked_files_.end()) { + // File was added before, we will just update the size + assert(!compaction); + total_files_size_ -= tracked_file->second; + total_files_size_ += file_size; + cur_compactions_reserved_size_ -= file_size; + } else { + total_files_size_ += file_size; + if (compaction) { + // Keep track of the size of files created by in-progress compactions. + // When calculating whether there's enough headroom for new compactions, + // this will be subtracted from cur_compactions_reserved_size_. + // Otherwise, compactions will be double counted. + in_progress_files_size_ += file_size; + in_progress_files_.insert(file_path); + } + } + tracked_files_[file_path] = file_size; +} + +void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file == tracked_files_.end()) { + // File is not tracked + assert(in_progress_files_.find(file_path) == in_progress_files_.end()); + return; + } + + total_files_size_ -= tracked_file->second; + // Check if it belonged to an in-progress compaction + if (in_progress_files_.find(file_path) != in_progress_files_.end()) { + in_progress_files_size_ -= tracked_file->second; + in_progress_files_.erase(file_path); + } + tracked_files_.erase(tracked_file); +} + +SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log, + std::string trash_dir, + int64_t rate_bytes_per_sec, + bool delete_existing_trash, Status* status, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) { + SstFileManagerImpl* res = + new SstFileManagerImpl(env, info_log, rate_bytes_per_sec, + max_trash_db_ratio, bytes_max_delete_chunk); + + // trash_dir is deprecated and not needed anymore, but if user passed it + // we will still remove files in it. + Status s; + if (delete_existing_trash && trash_dir != "") { + std::vector<std::string> files_in_trash; + s = env->GetChildren(trash_dir, &files_in_trash); + if (s.ok()) { + for (const std::string& trash_file : files_in_trash) { + if (trash_file == "." 
|| trash_file == "..") {
+          continue;
+        }
+
+        std::string path_in_trash = trash_dir + "/" + trash_file;
+        res->OnAddFile(path_in_trash);
+        Status file_delete =
+            res->ScheduleFileDeletion(path_in_trash, trash_dir);
+        if (s.ok() && !file_delete.ok()) {
+          s = file_delete;
+        }
+      }
+    }
+  }
+
+  if (status) {
+    *status = s;
+  }
+
+  return res;
+}
+
+#else
+
+SstFileManager* NewSstFileManager(Env* /*env*/,
+                                  std::shared_ptr<Logger> /*info_log*/,
+                                  std::string /*trash_dir*/,
+                                  int64_t /*rate_bytes_per_sec*/,
+                                  bool /*delete_existing_trash*/,
+                                  Status* status, double /*max_trash_db_ratio*/,
+                                  uint64_t /*bytes_max_delete_chunk*/) {
+  if (status) {
+    *status =
+        Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
+  }
+  return nullptr;
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/sst_file_manager_impl.h b/src/rocksdb/util/sst_file_manager_impl.h
new file mode 100644
index 00000000..211b4fa7
--- /dev/null
+++ b/src/rocksdb/util/sst_file_manager_impl.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "port/port.h"
+
+#include "db/compaction.h"
+#include "db/error_handler.h"
+#include "rocksdb/sst_file_manager.h"
+#include "util/delete_scheduler.h"
+
+namespace rocksdb {
+
+class Env;
+class Logger;
+
+// SstFileManager is used to track SST files in the DB and control their
+// deletion rate.
+// All SstFileManager public functions are thread-safe.
+class SstFileManagerImpl : public SstFileManager {
+ public:
+  explicit SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger,
+                              int64_t rate_bytes_per_sec,
+                              double max_trash_db_ratio,
+                              uint64_t bytes_max_delete_chunk);
+
+  ~SstFileManagerImpl();
+
+  // DB will call OnAddFile whenever a new sst file is added.
+  Status OnAddFile(const std::string& file_path, bool compaction = false);
+
+  // DB will call OnDeleteFile whenever an sst file is deleted.
+  Status OnDeleteFile(const std::string& file_path);
+
+  // DB will call OnMoveFile whenever an sst file is moved to a new path.
+  Status OnMoveFile(const std::string& old_path, const std::string& new_path,
+                    uint64_t* file_size = nullptr);
+
+  // Update the maximum allowed space that should be used by RocksDB. If
+  // the total size of the SST files exceeds max_allowed_space, writes to
+  // RocksDB will fail.
+  //
+  // Setting max_allowed_space to 0 will disable this feature; maximum allowed
+  // space will be infinite (Default value).
+  //
+  // thread-safe.
+  void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override;
+
+  void SetCompactionBufferSize(uint64_t compaction_buffer_size) override;
+
+  // Return true if the total size of SST files exceeded the maximum allowed
+  // space usage.
+  //
+  // thread-safe.
+  bool IsMaxAllowedSpaceReached() override;
+
+  bool IsMaxAllowedSpaceReachedIncludingCompactions() override;
+
+  // Returns true if there is enough (approximate) space for the specified
+  // compaction. Space is approximate because this function conservatively
+  // estimates how much space is currently being used by compactions (i.e.
+  // if a compaction has started, this function bumps the used space by
+  // the full compaction size).
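+  // Worked example with illustrative numbers: given max_allowed_space_ =
+  // 100 GB, total_files_size_ = 80 GB, cur_compactions_reserved_size_ =
+  // 10 GB and compaction_buffer_size_ = 5 GB, a compaction whose inputs sum
+  // to 6 GB needs 10 + 6 + 5 = 21 GB of headroom, and 21 + 80 > 100, so the
+  // compaction is rejected.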
+  bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+                               const std::vector<CompactionInputFiles>& inputs,
+                               Status bg_error);
+
+  // Bookkeeping so total_file_sizes_ goes back to normal after compaction
+  // finishes
+  void OnCompactionCompletion(Compaction* c);
+
+  uint64_t GetCompactionsReservedSize();
+
+  // Return the total size of all tracked files.
+  uint64_t GetTotalSize() override;
+
+  // Return a map containing all tracked files and their corresponding sizes.
+  std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
+
+  // Return delete rate limit in bytes per second.
+  virtual int64_t GetDeleteRateBytesPerSecond() override;
+
+  // Update the delete rate limit in bytes per second.
+  virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override;
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  virtual double GetMaxTrashDBRatio() override;
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  virtual void SetMaxTrashDBRatio(double ratio) override;
+
+  // Return the total size of trash files
+  uint64_t GetTotalTrashSize() override;
+
+  // Called by each DB instance using this sst file manager to reserve
+  // disk buffer space for recovery from out of space errors
+  void ReserveDiskBuffer(uint64_t buffer, const std::string& path);
+
+  // Set a flag upon encountering disk full. May enqueue the ErrorHandler
+  // instance for background polling and recovery
+  void StartErrorRecovery(ErrorHandler* db, Status bg_error);
+
+  // Remove the given ErrorHandler instance from the recovery queue. Removal
+  // is not guaranteed (it fails if the instance is currently recovering)
+  bool CancelErrorRecovery(ErrorHandler* db);
+
+  // Mark a file as trash and schedule its deletion. If force_bg is set, it
+  // forces the file to be deleted in the background regardless of DB size,
+  // except when rate-limited delete is disabled
+  virtual Status ScheduleFileDeletion(const std::string& file_path,
+                                      const std::string& dir_to_sync,
+                                      const bool force_bg = false);
+
+  // Wait for all files being deleted in the background to finish or for
+  // the destructor to be called.
+  virtual void WaitForEmptyTrash();
+
+  DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
+
+  // Stop the error recovery background thread. This should be called only
+  // once in the object's lifetime, and before the destructor
+  void Close();
+
+ private:
+  // REQUIRES: mutex locked
+  void OnAddFileImpl(const std::string& file_path, uint64_t file_size,
+                     bool compaction);
+  // REQUIRES: mutex locked
+  void OnDeleteFileImpl(const std::string& file_path);
+
+  void ClearError();
+  bool CheckFreeSpace() {
+    return bg_err_.severity() == Status::Severity::kSoftError;
+  }
+
+  Env* env_;
+  std::shared_ptr<Logger> logger_;
+  // Mutex to protect tracked_files_, total_files_size_
+  port::Mutex mu_;
+  // The summation of the sizes of all files in tracked_files_ map
+  uint64_t total_files_size_;
+  // The summation of all output files of in-progress compactions
+  uint64_t in_progress_files_size_;
+  // Compactions should only execute if they can leave at least
+  // this amount of buffer space for logs and flushes
+  uint64_t compaction_buffer_size_;
+  // Estimated size of the current ongoing compactions
+  uint64_t cur_compactions_reserved_size_;
+  // A map containing all tracked files and their sizes
+  //  file_path => file_size
+  std::unordered_map<std::string, uint64_t> tracked_files_;
+  // A set of files belonging to in-progress compactions
+  std::unordered_set<std::string> in_progress_files_;
+  // The maximum allowed space (in bytes) for sst files.
+  uint64_t max_allowed_space_;
+  // DeleteScheduler used to throttle file deletion.
+  DeleteScheduler delete_scheduler_;
+  port::CondVar cv_;
+  // Flag to force error recovery thread to exit
+  bool closing_;
+  // Background error recovery thread
+  std::unique_ptr<port::Thread> bg_thread_;
+  // A path in the filesystem corresponding to this SFM. This is used for
+  // calling Env::GetFreeSpace. Posix requires a path in the filesystem
+  std::string path_;
+  // Save the current background error
+  Status bg_err_;
+  // Amount of free disk headroom before allowing recovery from hard errors
+  uint64_t reserved_disk_buffer_;
+  // For soft errors, amount of free disk space before we can allow
+  // compactions to run full throttle. If disk space is below this trigger,
+  // compactions will be gated by free disk space > input size
+  uint64_t free_space_trigger_;
+  // List of database error handler instances tracked by this sst file manager
+  std::list<ErrorHandler*> error_handler_list_;
+  // Pointer to ErrorHandler instance that is currently processing recovery
+  ErrorHandler* cur_instance_;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/status.cc b/src/rocksdb/util/status.cc
new file mode 100644
index 00000000..c66bf6f8
--- /dev/null
+++ b/src/rocksdb/util/status.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
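+// As a quick illustration of the state layout built below (example values,
+// not from the original file): Status::IOError("write failed", "/tmp/f")
+// stores "write failed: /tmp/f", so ToString() returns
+// "IO error: write failed: /tmp/f".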
+ +#include "rocksdb/status.h" +#include <stdio.h> +#ifdef OS_WIN +#include <string.h> +#endif +#include <cstring> +#include "port/port.h" + +namespace rocksdb { + +const char* Status::CopyState(const char* state) { +#ifdef OS_WIN + const size_t cch = std::strlen(state) + 1; // +1 for the null terminator + char* result = new char[cch]; + errno_t ret; + ret = strncpy_s(result, cch, state, cch - 1); + result[cch - 1] = '\0'; + assert(ret == 0); + return result; +#else + const size_t cch = std::strlen(state) + 1; // +1 for the null terminator + return std::strncpy(new char[cch], state, cch); +#endif +} + +static const char* msgs[static_cast<int>(Status::kMaxSubCode)] = { + "", // kNone + "Timeout Acquiring Mutex", // kMutexTimeout + "Timeout waiting to lock key", // kLockTimeout + "Failed to acquire lock due to max_num_locks limit", // kLockLimit + "No space left on device", // kNoSpace + "Deadlock", // kDeadlock + "Stale file handle", // kStaleFile + "Memory limit reached", // kMemoryLimit + "Space limit reached", // kSpaceLimit + "No such file or directory", // kPathNotFound +}; + +Status::Status(Code _code, SubCode _subcode, const Slice& msg, + const Slice& msg2) + : code_(_code), subcode_(_subcode), sev_(kNoError) { + assert(code_ != kOk); + assert(subcode_ != kMaxSubCode); + const size_t len1 = msg.size(); + const size_t len2 = msg2.size(); + const size_t size = len1 + (len2 ? (2 + len2) : 0); + char* const result = new char[size + 1]; // +1 for null terminator + memcpy(result, msg.data(), len1); + if (len2) { + result[len1] = ':'; + result[len1 + 1] = ' '; + memcpy(result + len1 + 2, msg2.data(), len2); + } + result[size] = '\0'; // null terminator for C style string + state_ = result; +} + +std::string Status::ToString() const { + char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge in progress: "; + break; + case kIncomplete: + type = "Result incomplete: "; + break; + case kShutdownInProgress: + type = "Shutdown in progress: "; + break; + case kTimedOut: + type = "Operation timed out: "; + break; + case kAborted: + type = "Operation aborted: "; + break; + case kBusy: + type = "Resource busy: "; + break; + case kExpired: + type = "Operation expired: "; + break; + case kTryAgain: + type = "Operation failed. Try again.: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast<int>(code())); + type = tmp; + break; + } + std::string result(type); + if (subcode_ != kNone) { + uint32_t index = static_cast<int32_t>(subcode_); + assert(sizeof(msgs) > index); + result.append(msgs[index]); + } + + if (state_ != nullptr) { + result.append(state_); + } + return result; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/stderr_logger.h b/src/rocksdb/util/stderr_logger.h new file mode 100644 index 00000000..8612fce0 --- /dev/null +++ b/src/rocksdb/util/stderr_logger.h @@ -0,0 +1,31 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
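+// A typical way to wire the logger defined below into a DB (illustrative
+// sketch; assumes the standard rocksdb::Options API):
+//
+//   rocksdb::Options options;
+//   options.info_log = std::make_shared<rocksdb::StderrLogger>(
+//       rocksdb::InfoLogLevel::DEBUG_LEVEL);
+//
+// Info-log output then goes to stderr rather than the LOG file.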
+ +#pragma once + +#include <stdarg.h> +#include <stdio.h> + +#include "rocksdb/env.h" + +namespace rocksdb { + +// Prints logs to stderr for faster debugging +class StderrLogger : public Logger { + public: + explicit StderrLogger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : Logger(log_level) {} + + // Brings overloaded Logv()s into scope so they're not hidden when we override + // a subset of them. + using Logger::Logv; + + virtual void Logv(const char* format, va_list ap) override { + vfprintf(stderr, format, ap); + fprintf(stderr, "\n"); + } +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/stop_watch.h b/src/rocksdb/util/stop_watch.h new file mode 100644 index 00000000..afa708e3 --- /dev/null +++ b/src/rocksdb/util/stop_watch.h @@ -0,0 +1,118 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "monitoring/statistics.h" +#include "rocksdb/env.h" + +namespace rocksdb { +// Auto-scoped. +// Records the measure time into the corresponding histogram if statistics +// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr +// and overwrite is true, it will be added to *elapsed if overwrite is false. +class StopWatch { + public: + StopWatch(Env* const env, Statistics* statistics, const uint32_t hist_type, + uint64_t* elapsed = nullptr, bool overwrite = true, + bool delay_enabled = false) + : env_(env), + statistics_(statistics), + hist_type_(hist_type), + elapsed_(elapsed), + overwrite_(overwrite), + stats_enabled_(statistics && + statistics->get_stats_level() >= + StatsLevel::kExceptTimers && + statistics->HistEnabledForType(hist_type)), + delay_enabled_(delay_enabled), + total_delay_(0), + delay_start_time_(0), + start_time_((stats_enabled_ || elapsed != nullptr) ? env->NowMicros() + : 0) {} + + ~StopWatch() { + if (elapsed_) { + if (overwrite_) { + *elapsed_ = env_->NowMicros() - start_time_; + } else { + *elapsed_ += env_->NowMicros() - start_time_; + } + } + if (elapsed_ && delay_enabled_) { + *elapsed_ -= total_delay_; + } + if (stats_enabled_) { + statistics_->reportTimeToHistogram( + hist_type_, (elapsed_ != nullptr) + ? *elapsed_ + : (env_->NowMicros() - start_time_)); + } + } + + void DelayStart() { + // if delay_start_time_ is not 0, it means we are already tracking delay, + // so delay_start_time_ should not be overwritten + if (elapsed_ && delay_enabled_ && delay_start_time_ == 0) { + delay_start_time_ = env_->NowMicros(); + } + } + + void DelayStop() { + if (elapsed_ && delay_enabled_ && delay_start_time_ != 0) { + total_delay_ += env_->NowMicros() - delay_start_time_; + } + // reset to 0 means currently no delay is being tracked, so two consecutive + // calls to DelayStop will not increase total_delay_ + delay_start_time_ = 0; + } + + uint64_t GetDelay() const { return delay_enabled_ ? 
total_delay_ : 0; } + + uint64_t start_time() const { return start_time_; } + + private: + Env* const env_; + Statistics* statistics_; + const uint32_t hist_type_; + uint64_t* elapsed_; + bool overwrite_; + bool stats_enabled_; + bool delay_enabled_; + uint64_t total_delay_; + uint64_t delay_start_time_; + const uint64_t start_time_; +}; + +// a nano second precision stopwatch +class StopWatchNano { + public: + explicit StopWatchNano(Env* const env, bool auto_start = false) + : env_(env), start_(0) { + if (auto_start) { + Start(); + } + } + + void Start() { start_ = env_->NowNanos(); } + + uint64_t ElapsedNanos(bool reset = false) { + auto now = env_->NowNanos(); + auto elapsed = now - start_; + if (reset) { + start_ = now; + } + return elapsed; + } + + uint64_t ElapsedNanosSafe(bool reset = false) { + return (env_ != nullptr) ? ElapsedNanos(reset) : 0U; + } + + private: + Env* const env_; + uint64_t start_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/string_util.cc b/src/rocksdb/util/string_util.cc new file mode 100644 index 00000000..26e6759a --- /dev/null +++ b/src/rocksdb/util/string_util.cc @@ -0,0 +1,403 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "util/string_util.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <errno.h> +#include <inttypes.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <cmath> +#include <sstream> +#include <string> +#include <utility> +#include <vector> +#include "rocksdb/env.h" +#include "port/port.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +const std::string kNullptrString = "nullptr"; + +std::vector<std::string> StringSplit(const std::string& arg, char delim) { + std::vector<std::string> splits; + std::stringstream ss(arg); + std::string item; + while (std::getline(ss, item, delim)) { + splits.push_back(item); + } + return splits; +} + +// for micros < 10ms, print "XX us". +// for micros < 10sec, print "XX ms". +// for micros >= 10 sec, print "XX sec". +// for micros <= 1 hour, print Y:X M:S". +// for micros > 1 hour, print Z:Y:X H:M:S". +int AppendHumanMicros(uint64_t micros, char* output, int len, + bool fixed_format) { + if (micros < 10000 && !fixed_format) { + return snprintf(output, len, "%" PRIu64 " us", micros); + } else if (micros < 10000000 && !fixed_format) { + return snprintf(output, len, "%.3lf ms", + static_cast<double>(micros) / 1000); + } else if (micros < 1000000l * 60 && !fixed_format) { + return snprintf(output, len, "%.3lf sec", + static_cast<double>(micros) / 1000000); + } else if (micros < 1000000ll * 60 * 60 && !fixed_format) { + return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S", + micros / 1000000 / 60, + static_cast<double>(micros % 60000000) / 1000000); + } else { + return snprintf(output, len, "%02" PRIu64 ":%02" PRIu64 ":%05.3f H:M:S", + micros / 1000000 / 3600, (micros / 1000000 / 60) % 60, + static_cast<double>(micros % 60000000) / 1000000); + } +} + +// for sizes >=10TB, print "XXTB" +// for sizes >=10GB, print "XXGB" +// etc. 
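+// (editor's illustrative examples, not part of the upstream comment:)
+//   AppendHumanBytes(50ull << 30, buf, len) writes "50GB"
+//   AppendHumanBytes(2ull << 40, buf, len)  writes "2048GB" (2TB is below
+//   the 10TB cutoff, so it stays in GB)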
+// append file size summary to output and return the len +int AppendHumanBytes(uint64_t bytes, char* output, int len) { + const uint64_t ull10 = 10; + if (bytes >= ull10 << 40) { + return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40); + } else if (bytes >= ull10 << 30) { + return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30); + } else if (bytes >= ull10 << 20) { + return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20); + } else if (bytes >= ull10 << 10) { + return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10); + } else { + return snprintf(output, len, "%" PRIu64 "B", bytes); + } +} + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%" PRIu64, num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast<unsigned int>(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string NumberToHumanString(int64_t num) { + char buf[19]; + int64_t absnum = num < 0 ? -num : num; + if (absnum < 10000) { + snprintf(buf, sizeof(buf), "%" PRIi64, num); + } else if (absnum < 10000000) { + snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000); + } else if (absnum < 10000000000LL) { + snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000); + } else { + snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000); + } + return std::string(buf); +} + +std::string BytesToHumanString(uint64_t bytes) { + const char* size_name[] = {"KB", "MB", "GB", "TB"}; + double final_size = static_cast<double>(bytes); + size_t size_idx; + + // always start with KB + final_size /= 1024; + size_idx = 0; + + while (size_idx < 3 && final_size >= 1024) { + final_size /= 1024; + size_idx++; + } + + char buf[20]; + snprintf(buf, sizeof(buf), "%.2f %s", final_size, size_name[size_idx]); + return std::string(buf); +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const unsigned int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0); + if (v > kMaxUint64 / 10 || + (v == kMaxUint64 / 10 && delta > kMaxUint64 % 10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +bool isSpecialChar(const char c) { + if (c == '\\' || c == '#' || c == ':' || c == '\r' || c == '\n') { + return true; + } + return false; +} + +namespace { +using CharMap = std::pair<char, char>; +} + +char UnescapeChar(const char c) { + static const CharMap convert_map[] = {{'r', '\r'}, {'n', '\n'}}; + + auto iter = std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + } + return iter->second; +} + +char EscapeChar(const char c) { + static const CharMap convert_map[] = {{'\n', 'n'}, {'\r', 'r'}}; + + auto iter = std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + 
} + return iter->second; +} + +std::string EscapeOptionString(const std::string& raw_string) { + std::string output; + for (auto c : raw_string) { + if (isSpecialChar(c)) { + output += '\\'; + output += EscapeChar(c); + } else { + output += c; + } + } + + return output; +} + +std::string UnescapeOptionString(const std::string& escaped_string) { + bool escaped = false; + std::string output; + + for (auto c : escaped_string) { + if (escaped) { + output += UnescapeChar(c); + escaped = false; + } else { + if (c == '\\') { + escaped = true; + continue; + } + output += c; + } + } + return output; +} + +std::string trim(const std::string& str) { + if (str.empty()) return std::string(); + size_t start = 0; + size_t end = str.size() - 1; + while (isspace(str[start]) != 0 && start < end) { + ++start; + } + while (isspace(str[end]) != 0 && start < end) { + --end; + } + if (start <= end) { + return str.substr(start, end - start + 1); + } + return std::string(); +} + +#ifndef ROCKSDB_LITE + +bool ParseBoolean(const std::string& type, const std::string& value) { + if (value == "true" || value == "1") { + return true; + } else if (value == "false" || value == "0") { + return false; + } + throw std::invalid_argument(type); +} + +uint32_t ParseUint32(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 32LL) == 0) { + return static_cast<uint32_t>(num); + } else { + throw std::out_of_range(value); + } +} + +int32_t ParseInt32(const std::string& value) { + int64_t num = ParseInt64(value); + if (num <= port::kMaxInt32 && num >= port::kMinInt32) { + return static_cast<int32_t>(num); + } else { + throw std::out_of_range(value); + } +} + +#endif + +uint64_t ParseUint64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + uint64_t num = std::stoull(value.c_str(), &endchar); +#else + char* endptr; + uint64_t num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int64_t ParseInt64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int64_t num = std::stoll(value.c_str(), &endchar); +#else + char* endptr; + int64_t num = std::strtoll(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int ParseInt(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int num = std::stoi(value.c_str(), &endchar); +#else + char* endptr; + int num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10; + else if (c == 'm' || c == 'M') + num <<= 20; + else if (c == 'g' || c == 'G') + num <<= 30; + } + + return num; +} + +double ParseDouble(const std::string& value) { +#ifndef CYGWIN + return std::stod(value); +#else + return std::strtod(value.c_str(), 0); +#endif +} + +size_t ParseSizeT(const std::string& value) { + return static_cast<size_t>(ParseUint64(value)); +} + +std::vector<int> ParseVectorInt(const std::string& value) { + 
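+  // (Editor's worked example, not in the upstream source.) The input is a
+  // ':'-separated list whose elements go through ParseInt(), so the k/m/g
+  // size suffixes apply per element:
+  //   ParseVectorInt("1k:2:3") -> {1024, 2, 3}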
+  std::vector<int> result;
+  size_t start = 0;
+  while (start < value.size()) {
+    size_t end = value.find(':', start);
+    if (end == std::string::npos) {
+      result.push_back(ParseInt(value.substr(start)));
+      break;
+    } else {
+      result.push_back(ParseInt(value.substr(start, end - start)));
+      start = end + 1;
+    }
+  }
+  return result;
+}
+
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value) {
+  *value = "";
+  for (size_t i = 0; i < vec.size(); ++i) {
+    if (i > 0) {
+      *value += ":";
+    }
+    *value += ToString(vec[i]);
+  }
+  return true;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/string_util.h b/src/rocksdb/util/string_util.h
new file mode 100644
index 00000000..6e125ddf
--- /dev/null
+++ b/src/rocksdb/util/string_util.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace rocksdb {
+
+class Slice;
+
+extern std::vector<std::string> StringSplit(const std::string& arg, char delim);
+
+template <typename T>
+inline std::string ToString(T value) {
+#if !(defined OS_ANDROID) && !(defined CYGWIN) && !(defined OS_FREEBSD)
+  return std::to_string(value);
+#else
+  // Android and Cygwin don't support all of C++11, std::to_string() being
+  // one of the unsupported features.
+  std::ostringstream os;
+  os << value;
+  return os.str();
+#endif
+}
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Return a string printout of "num"
+extern std::string NumberToString(uint64_t num);
+
+// Return a human-readable version of num.
+// for num >= 10,000, prints "xxK"
+// for num >= 10,000,000, prints "xxM"
+// for num >= 10,000,000,000, prints "xxG"
+extern std::string NumberToHumanString(int64_t num);
+
+// Return a human-readable version of bytes
+// ex: 1048576 -> 1.00 MB
+extern std::string BytesToHumanString(uint64_t bytes);
+
+// Append a human-readable time in micros.
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format);
+
+// Append a human-readable size in bytes
+int AppendHumanBytes(uint64_t bytes, char* output, int len);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// Parse a human-readable number from "*in" into *value. On success,
+// advances "*in" past the consumed number and sets "*val" to the
+// numeric value. Otherwise, returns false and leaves *in in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+// Returns true if the input char "c" is considered a special character
+// that will be escaped when EscapeOptionString() is called.
+//
+// @param c the input char
+// @return true if the input char "c" is considered a special character.
+// @see EscapeOptionString
+bool isSpecialChar(const char c);
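+
+// (Editor's note: a couple of worked examples for the formatters declared
+// above; the function name is hypothetical and the block is illustrative
+// only, not part of the upstream header.)
+inline void HumanReadableExamples() {
+  std::string n = NumberToHumanString(12345678);  // "12M"
+  std::string b = BytesToHumanString(1048576);    // "1.00 MB"
+  (void)n;
+  (void)b;
+}
+
+// If the input char is an escaped char, it will return its
+// associated raw char. Otherwise, the function will simply return
+// the original input char.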
+char UnescapeChar(const char c);
+
+// If the input char is a control char, it will return its
+// associated escaped char. Otherwise, the function will simply return
+// the original input char.
+char EscapeChar(const char c);
+
+// Converts a raw string to an escaped string. Escaped characters are
+// defined via the isSpecialChar() function. When a char in the input
+// string "raw_string" is classified as a special character, it will
+// be prefixed by '\' in the output.
+//
+// Its inverse function is UnescapeOptionString().
+// @param raw_string the input string
+// @return the '\' escaped string of the input "raw_string"
+// @see isSpecialChar, UnescapeOptionString
+std::string EscapeOptionString(const std::string& raw_string);
+
+// The inverse function of EscapeOptionString. It converts
+// an '\' escaped string back to a raw string.
+//
+// @param escaped_string the input '\' escaped string
+// @return the raw string of the input "escaped_string"
+std::string UnescapeOptionString(const std::string& escaped_string);
+
+std::string trim(const std::string& str);
+
+#ifndef ROCKSDB_LITE
+bool ParseBoolean(const std::string& type, const std::string& value);
+
+uint32_t ParseUint32(const std::string& value);
+
+int32_t ParseInt32(const std::string& value);
+#endif
+
+uint64_t ParseUint64(const std::string& value);
+
+int ParseInt(const std::string& value);
+
+int64_t ParseInt64(const std::string& value);
+
+double ParseDouble(const std::string& value);
+
+size_t ParseSizeT(const std::string& value);
+
+std::vector<int> ParseVectorInt(const std::string& value);
+
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value);
+
+extern const std::string kNullptrString;
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/sync_point.cc b/src/rocksdb/util/sync_point.cc
new file mode 100644
index 00000000..4599c256
--- /dev/null
+++ b/src/rocksdb/util/sync_point.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
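+
+// (Editor's note, illustrative only.) The two globals defined below drive the
+// TEST_KILL_RANDOM facility declared in util/sync_point.h. A stress test can
+// set, for example (the prefix name here is hypothetical):
+//
+//   rocksdb_kill_odds = 10;  // crash with probability 1/10 at each kill point
+//   rocksdb_kill_prefix_blacklist = {"SomeComponentPrefix"};  // but never here
+//
+// so that instrumented code paths call TestKillRandom() and the process dies
+// at a random kill point, exercising crash recovery.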
+
+#include "util/sync_point.h"
+#include "util/sync_point_impl.h"
+
+int rocksdb_kill_odds = 0;
+std::vector<std::string> rocksdb_kill_prefix_blacklist;
+
+#ifndef NDEBUG
+namespace rocksdb {
+
+SyncPoint* SyncPoint::GetInstance() {
+  static SyncPoint sync_point;
+  return &sync_point;
+}
+
+SyncPoint::SyncPoint() : impl_(new Data) {}
+
+SyncPoint::~SyncPoint() {
+  delete impl_;
+}
+
+void SyncPoint::LoadDependency(const std::vector<SyncPointPair>& dependencies) {
+  impl_->LoadDependency(dependencies);
+}
+
+void SyncPoint::LoadDependencyAndMarkers(
+    const std::vector<SyncPointPair>& dependencies,
+    const std::vector<SyncPointPair>& markers) {
+  impl_->LoadDependencyAndMarkers(dependencies, markers);
+}
+
+void SyncPoint::SetCallBack(const std::string& point,
+                            const std::function<void(void*)>& callback) {
+  impl_->SetCallBack(point, callback);
+}
+
+void SyncPoint::ClearCallBack(const std::string& point) {
+  impl_->ClearCallBack(point);
+}
+
+void SyncPoint::ClearAllCallBacks() {
+  impl_->ClearAllCallBacks();
+}
+
+void SyncPoint::EnableProcessing() {
+  impl_->EnableProcessing();
+}
+
+void SyncPoint::DisableProcessing() {
+  impl_->DisableProcessing();
+}
+
+void SyncPoint::ClearTrace() {
+  impl_->ClearTrace();
+}
+
+void SyncPoint::Process(const std::string& point, void* cb_arg) {
+  impl_->Process(point, cb_arg);
+}
+
+}  // namespace rocksdb
+#endif  // NDEBUG
diff --git a/src/rocksdb/util/sync_point.h b/src/rocksdb/util/sync_point.h
new file mode 100644
index 00000000..cb4b1e71
--- /dev/null
+++ b/src/rocksdb/util/sync_point.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <assert.h>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+// This is only set from db_stress.cc and for testing only.
+// If non-zero, kill at various points in source code with probability 1/this
+extern int rocksdb_kill_odds;
+// If kill point has a prefix on this list, will skip killing.
+extern std::vector<std::string> rocksdb_kill_prefix_blacklist;
+
+#ifdef NDEBUG
+// empty in release build
+#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds)
+#else
+
+namespace rocksdb {
+// Kill the process with probability 1/odds for testing.
+extern void TestKillRandom(std::string kill_point, int odds,
+                           const std::string& srcfile, int srcline);
+
+// To avoid always crashing at some frequently executed code paths (during
+// the kill random test), use this factor to reduce the odds
+#define REDUCE_ODDS 2
+#define REDUCE_ODDS2 4
+
+#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds)                  \
+  {                                                                      \
+    if (rocksdb_kill_odds > 0) {                                         \
+      TestKillRandom(kill_point, rocksdb_kill_odds, __FILE__, __LINE__); \
+    }                                                                    \
+  }
+}  // namespace rocksdb
+#endif
+
+#ifdef NDEBUG
+#define TEST_SYNC_POINT(x)
+#define TEST_IDX_SYNC_POINT(x, index)
+#define TEST_SYNC_POINT_CALLBACK(x, y)
+#define INIT_SYNC_POINT_SINGLETONS()
+#else
+
+namespace rocksdb {
+
+// This class provides a facility to reproduce race conditions
+// deterministically in unit tests.
+// Developers can specify sync points in the codebase via TEST_SYNC_POINT.
+// Each sync point represents a position in the execution stream of a thread.
+// In the unit test, 'Happens After' relationships among sync points can be
+// set up via SyncPoint::LoadDependency, to reproduce a desired interleaving
+// of thread execution.
+// Refer to (DBTest,TransactionLogIteratorRace) for an example use case.
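+//
+// A minimal usage sketch (editor's illustration; the point names are
+// hypothetical):
+//
+//   // In the test body: make thread B's point wait on thread A's.
+//   rocksdb::SyncPoint::GetInstance()->LoadDependency(
+//       {{"ThreadA:AfterWrite", "ThreadB:BeforeRead"}});
+//   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+//
+//   // In the code under test:
+//   TEST_SYNC_POINT("ThreadA:AfterWrite");  // thread A
+//   TEST_SYNC_POINT("ThreadB:BeforeRead");  // thread B blocks until A passes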
+
+class SyncPoint {
+ public:
+  static SyncPoint* GetInstance();
+
+  SyncPoint(const SyncPoint&) = delete;
+  SyncPoint& operator=(const SyncPoint&) = delete;
+  ~SyncPoint();
+
+  struct SyncPointPair {
+    std::string predecessor;
+    std::string successor;
+  };
+
+  // call once at the beginning of a test to set up the dependency between
+  // sync points
+  void LoadDependency(const std::vector<SyncPointPair>& dependencies);
+
+  // call once at the beginning of a test to set up the dependency between
+  // sync points and set up markers indicating the successor is only enabled
+  // when it is processed on the same thread as the predecessor.
+  // When adding a marker, it implicitly adds a dependency for the marker pair.
+  void LoadDependencyAndMarkers(const std::vector<SyncPointPair>& dependencies,
+                                const std::vector<SyncPointPair>& markers);
+
+  // The argument to the callback is passed through from
+  // TEST_SYNC_POINT_CALLBACK(); nullptr if TEST_SYNC_POINT or
+  // TEST_IDX_SYNC_POINT was used.
+  void SetCallBack(const std::string& point,
+                   const std::function<void(void*)>& callback);
+
+  // Clear the callback function registered for a point
+  void ClearCallBack(const std::string& point);
+
+  // Clear all callback functions.
+  void ClearAllCallBacks();
+
+  // enable sync point processing (disabled on startup)
+  void EnableProcessing();
+
+  // disable sync point processing
+  void DisableProcessing();
+
+  // remove the execution trace of all sync points
+  void ClearTrace();
+
+  // triggered by TEST_SYNC_POINT, blocking execution until all predecessors
+  // are executed.
+  // And/or call the registered callback function, with argument `cb_arg`
+  void Process(const std::string& point, void* cb_arg = nullptr);
+
+  // TODO: it might be useful to provide a function that blocks until all
+  // sync points are cleared.
+
+  // We want this to be public so we can
+  // subclass the implementation
+  struct Data;
+
+ private:
+  // Singleton
+  SyncPoint();
+  Data* impl_;
+};
+
+}  // namespace rocksdb
+
+// Use TEST_SYNC_POINT to specify sync points inside the code base.
+// Sync points can have a happens-after dependency on other sync points,
+// configured at runtime via SyncPoint::LoadDependency. This can be
+// used to reproduce race conditions between threads.
+// See TransactionLogIteratorRace in db_test.cc for an example use case.
+// TEST_SYNC_POINT is a no-op in release builds.
+#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
+#define TEST_IDX_SYNC_POINT(x, index) \
+  rocksdb::SyncPoint::GetInstance()->Process(x + std::to_string(index))
+#define TEST_SYNC_POINT_CALLBACK(x, y) \
+  rocksdb::SyncPoint::GetInstance()->Process(x, y)
+#define INIT_SYNC_POINT_SINGLETONS() \
+  (void)rocksdb::SyncPoint::GetInstance();
+#endif  // NDEBUG
diff --git a/src/rocksdb/util/sync_point_impl.cc b/src/rocksdb/util/sync_point_impl.cc
new file mode 100644
index 00000000..248c381a
--- /dev/null
+++ b/src/rocksdb/util/sync_point_impl.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+ +#include "util/sync_point_impl.h" + +#ifndef NDEBUG +namespace rocksdb { + +void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline) { + for (auto& p : rocksdb_kill_prefix_blacklist) { + if (kill_point.substr(0, p.length()) == p) { + return; + } + } + + assert(odds > 0); + if (odds % 7 == 0) { + // class Random uses multiplier 16807, which is 7^5. If odds are + // multiplier of 7, there might be limited values generated. + odds++; + } + auto* r = Random::GetTLSInstance(); + bool crash = r->OneIn(odds); + if (crash) { + port::Crash(srcfile, srcline); + } +} + + +void SyncPoint::Data::LoadDependency(const std::vector<SyncPointPair>& dependencies) { + std::lock_guard<std::mutex> lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } + cv_.notify_all(); +} + +void SyncPoint::Data::LoadDependencyAndMarkers( + const std::vector<SyncPointPair>& dependencies, + const std::vector<SyncPointPair>& markers) { + std::lock_guard<std::mutex> lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + markers_.clear(); + marked_thread_id_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } + for (const auto& marker : markers) { + successors_[marker.predecessor].push_back(marker.successor); + predecessors_[marker.successor].push_back(marker.predecessor); + markers_[marker.predecessor].push_back(marker.successor); + } + cv_.notify_all(); +} + +bool SyncPoint::Data::PredecessorsAllCleared(const std::string& point) { + for (const auto& pred : predecessors_[point]) { + if (cleared_points_.count(pred) == 0) { + return false; + } + } + return true; +} + +void SyncPoint::Data::ClearCallBack(const std::string& point) { + std::unique_lock<std::mutex> lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.erase(point); +} + +void SyncPoint::Data::ClearAllCallBacks() { + std::unique_lock<std::mutex> lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.clear(); +} + +void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { + if (!enabled_) { + return; + } + + std::unique_lock<std::mutex> lock(mutex_); + auto thread_id = std::this_thread::get_id(); + + auto marker_iter = markers_.find(point); + if (marker_iter != markers_.end()) { + for (auto& marked_point : marker_iter->second) { + marked_thread_id_.emplace(marked_point, thread_id); + } + } + + if (DisabledByMarker(point, thread_id)) { + return; + } + + while (!PredecessorsAllCleared(point)) { + cv_.wait(lock); + if (DisabledByMarker(point, thread_id)) { + return; + } + } + + auto callback_pair = callbacks_.find(point); + if (callback_pair != callbacks_.end()) { + num_callbacks_running_++; + mutex_.unlock(); + callback_pair->second(cb_arg); + mutex_.lock(); + num_callbacks_running_--; + } + cleared_points_.insert(point); + cv_.notify_all(); +} +} // rocksdb +#endif diff --git a/src/rocksdb/util/sync_point_impl.h b/src/rocksdb/util/sync_point_impl.h new file mode 100644 index 00000000..3c7e7049 --- /dev/null +++ b/src/rocksdb/util/sync_point_impl.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/sync_point.h" + +#include <assert.h> +#include <atomic> +#include <condition_variable> +#include <functional> +#include <mutex> +#include <string> +#include <thread> +#include <unordered_map> +#include <unordered_set> + +#include "port/port.h" +#include "util/random.h" + +#pragma once + +#ifndef NDEBUG +namespace rocksdb { +struct SyncPoint::Data { + Data() : enabled_(false) {} + // Enable proper deletion by subclasses + virtual ~Data() {} + // successor/predecessor map loaded from LoadDependency + std::unordered_map<std::string, std::vector<std::string>> successors_; + std::unordered_map<std::string, std::vector<std::string>> predecessors_; + std::unordered_map<std::string, std::function<void(void*)> > callbacks_; + std::unordered_map<std::string, std::vector<std::string> > markers_; + std::unordered_map<std::string, std::thread::id> marked_thread_id_; + + std::mutex mutex_; + std::condition_variable cv_; + // sync points that have been passed through + std::unordered_set<std::string> cleared_points_; + std::atomic<bool> enabled_; + int num_callbacks_running_ = 0; + + void LoadDependency(const std::vector<SyncPointPair>& dependencies); + void LoadDependencyAndMarkers(const std::vector<SyncPointPair>& dependencies, + const std::vector<SyncPointPair>& markers); + bool PredecessorsAllCleared(const std::string& point); + void SetCallBack(const std::string& point, + const std::function<void(void*)>& callback) { + std::lock_guard<std::mutex> lock(mutex_); + callbacks_[point] = callback; +} + + void ClearCallBack(const std::string& point); + void ClearAllCallBacks(); + void EnableProcessing() { + enabled_ = true; + } + void DisableProcessing() { + enabled_ = false; + } + void ClearTrace() { + std::lock_guard<std::mutex> lock(mutex_); + cleared_points_.clear(); + } + bool DisabledByMarker(const std::string& point, + std::thread::id thread_id) { + auto marked_point_iter = marked_thread_id_.find(point); + return marked_point_iter != marked_thread_id_.end() && + thread_id != marked_point_iter->second; + } + void Process(const std::string& point, void* cb_arg); +}; +} +#endif // NDEBUG diff --git a/src/rocksdb/util/testharness.cc b/src/rocksdb/util/testharness.cc new file mode 100644 index 00000000..8f5eb2a4 --- /dev/null +++ b/src/rocksdb/util/testharness.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
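+
+// (Editor's sketch of how the helpers defined below are typically used in a
+// gtest binary; illustrative only, test and directory names are
+// hypothetical.)
+//
+//   TEST(ExampleTest, CreatesDir) {
+//     rocksdb::Env* env = rocksdb::Env::Default();
+//     std::string dir = rocksdb::test::PerThreadDBPath(env, "example");
+//     ASSERT_OK(env->CreateDirIfMissing(dir));
+//   }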
+ +#include "util/testharness.h" +#include <string> +#include <thread> + +namespace rocksdb { +namespace test { + +::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) { + if (s.ok()) { + return ::testing::AssertionSuccess(); + } else { + return ::testing::AssertionFailure() << s_expr << std::endl + << s.ToString(); + } +} + +std::string TmpDir(Env* env) { + std::string dir; + Status s = env->GetTestDirectory(&dir); + EXPECT_TRUE(s.ok()) << s.ToString(); + return dir; +} + +std::string PerThreadDBPath(std::string dir, std::string name) { + size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id()); + return dir + "/" + name + "_" + std::to_string(tid); +} + +std::string PerThreadDBPath(std::string name) { + return PerThreadDBPath(test::TmpDir(), name); +} + +std::string PerThreadDBPath(Env* env, std::string name) { + return PerThreadDBPath(test::TmpDir(env), name); +} + +int RandomSeed() { + const char* env = getenv("TEST_RANDOM_SEED"); + int result = (env != nullptr ? atoi(env) : 301); + if (result <= 0) { + result = 301; + } + return result; +} + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/testharness.h b/src/rocksdb/util/testharness.h new file mode 100644 index 00000000..39e77f8a --- /dev/null +++ b/src/rocksdb/util/testharness.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifdef OS_AIX +#include "gtest/gtest.h" +#else +#include <gtest/gtest.h> +#endif + +#include <string> +#include "rocksdb/env.h" + +namespace rocksdb { +namespace test { + +// Return the directory to use for temporary storage. +std::string TmpDir(Env* env = Env::Default()); + +// A path unique within the thread +std::string PerThreadDBPath(std::string name); +std::string PerThreadDBPath(Env* env, std::string name); +std::string PerThreadDBPath(std::string dir, std::string name); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +int RandomSeed(); + +::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s); + +#define ASSERT_OK(s) ASSERT_PRED_FORMAT1(rocksdb::test::AssertStatus, s) +#define ASSERT_NOK(s) ASSERT_FALSE((s).ok()) +#define EXPECT_OK(s) EXPECT_PRED_FORMAT1(rocksdb::test::AssertStatus, s) +#define EXPECT_NOK(s) EXPECT_FALSE((s).ok()) + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/testutil.cc b/src/rocksdb/util/testutil.cc new file mode 100644 index 00000000..ec95d107 --- /dev/null +++ b/src/rocksdb/util/testutil.cc @@ -0,0 +1,417 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "util/testutil.h" + +#include <cctype> +#include <sstream> + +#include "db/memtable_list.h" +#include "port/port.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { +namespace test { + +const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; +const uint32_t kLatestFormatVersion = 4u; + +Slice RandomString(Random* rnd, int len, std::string* dst) { + dst->resize(len); + for (int i = 0; i < len; i++) { + (*dst)[i] = static_cast<char>(' ' + rnd->Uniform(95)); // ' ' .. '~' + } + return Slice(*dst); +} + +extern std::string RandomHumanReadableString(Random* rnd, int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; ++i) { + ret[i] = static_cast<char>('a' + rnd->Uniform(26)); + } + return ret; +} + +std::string RandomKey(Random* rnd, int len, RandomKeyType type) { + // Make sure to generate a wide variety of characters so we + // test the boundary conditions for short-key optimizations. + static const char kTestChars[] = {'\0', '\1', 'a', 'b', 'c', + 'd', 'e', '\xfd', '\xfe', '\xff'}; + std::string result; + for (int i = 0; i < len; i++) { + std::size_t indx = 0; + switch (type) { + case RandomKeyType::RANDOM: + indx = rnd->Uniform(sizeof(kTestChars)); + break; + case RandomKeyType::LARGEST: + indx = sizeof(kTestChars) - 1; + break; + case RandomKeyType::MIDDLE: + indx = sizeof(kTestChars) / 2; + break; + case RandomKeyType::SMALLEST: + indx = 0; + break; + } + result += kTestChars[indx]; + } + return result; +} + +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast<int>(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data; + RandomString(rnd, raw, &raw_data); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < (unsigned int)len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + +namespace { +class Uint64ComparatorImpl : public Comparator { + public: + Uint64ComparatorImpl() {} + + const char* Name() const override { return "rocksdb.Uint64Comparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + assert(a.size() == sizeof(uint64_t) && b.size() == sizeof(uint64_t)); + const uint64_t* left = reinterpret_cast<const uint64_t*>(a.data()); + const uint64_t* right = reinterpret_cast<const uint64_t*>(b.data()); + uint64_t leftValue; + uint64_t rightValue; + GetUnaligned(left, &leftValue); + GetUnaligned(right, &rightValue); + if (leftValue == rightValue) { + return 0; + } else if (leftValue < rightValue) { + return -1; + } else { + return 1; + } + } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override { + return; + } + + void FindShortSuccessor(std::string* /*key*/) const override { return; } +}; +} // namespace + +const Comparator* Uint64Comparator() { + static Uint64ComparatorImpl uint64comp; + return &uint64comp; +} + +WritableFileWriter* GetWritableFileWriter(WritableFile* wf, + const std::string& fname) { + std::unique_ptr<WritableFile> file(wf); + return new WritableFileWriter(std::move(file), fname, EnvOptions()); +} + +RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { + std::unique_ptr<RandomAccessFile> file(raf); + return new RandomAccessFileReader(std::move(file), + "[test RandomAccessFileReader]"); +} + +SequentialFileReader* GetSequentialFileReader(SequentialFile* se, + const std::string& fname) { + 
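+  // Takes ownership of `se`: the raw pointer is adopted by a unique_ptr and
+  // handed to the SequentialFileReader, mirroring GetWritableFileWriter()
+  // and GetRandomAccessFileReader() above.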
std::unique_ptr<SequentialFile> file(se); + return new SequentialFileReader(std::move(file), fname); +} + +void CorruptKeyType(InternalKey* ikey) { + std::string keystr = ikey->Encode().ToString(); + keystr[keystr.size() - 8] = kTypeLogData; + ikey->DecodeFrom(Slice(keystr.data(), keystr.size())); +} + +std::string KeyStr(const std::string& user_key, const SequenceNumber& seq, + const ValueType& t, bool corrupt) { + InternalKey k(user_key, seq, t); + if (corrupt) { + CorruptKeyType(&k); + } + return k.Encode().ToString(); +} + +std::string RandomName(Random* rnd, const size_t len) { + std::stringstream ss; + for (size_t i = 0; i < len; ++i) { + ss << static_cast<char>(rnd->Uniform(26) + 'a'); + } + return ss.str(); +} + +CompressionType RandomCompressionType(Random* rnd) { + return static_cast<CompressionType>(rnd->Uniform(6)); +} + +void RandomCompressionTypeVector(const size_t count, + std::vector<CompressionType>* types, + Random* rnd) { + types->clear(); + for (size_t i = 0; i < count; ++i) { + types->emplace_back(RandomCompressionType(rnd)); + } +} + +const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined) { + int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4); + switch (random_num) { + case 0: + return NewFixedPrefixTransform(rnd->Uniform(20) + 1); + case 1: + return NewCappedPrefixTransform(rnd->Uniform(20) + 1); + case 2: + return NewNoopTransform(); + default: + return nullptr; + } +} + +BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { + BlockBasedTableOptions opt; + opt.cache_index_and_filter_blocks = rnd->Uniform(2); + opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); + opt.pin_top_level_index_and_filter = rnd->Uniform(2); + opt.index_type = rnd->Uniform(2) ? BlockBasedTableOptions::kBinarySearch + : BlockBasedTableOptions::kHashSearch; + opt.hash_index_allow_collision = rnd->Uniform(2); + opt.checksum = static_cast<ChecksumType>(rnd->Uniform(3)); + opt.block_size = rnd->Uniform(10000000); + opt.block_size_deviation = rnd->Uniform(100); + opt.block_restart_interval = rnd->Uniform(100); + opt.index_block_restart_interval = rnd->Uniform(100); + opt.whole_key_filtering = rnd->Uniform(2); + + return opt; +} + +TableFactory* RandomTableFactory(Random* rnd, int pre_defined) { +#ifndef ROCKSDB_LITE + int random_num = pre_defined >= 0 ? 
pre_defined : rnd->Uniform(4); + switch (random_num) { + case 0: + return NewPlainTableFactory(); + case 1: + return NewCuckooTableFactory(); + default: + return NewBlockBasedTableFactory(); + } +#else + (void)rnd; + (void)pre_defined; + return NewBlockBasedTableFactory(); +#endif // !ROCKSDB_LITE +} + +MergeOperator* RandomMergeOperator(Random* rnd) { + return new ChanglingMergeOperator(RandomName(rnd, 10)); +} + +CompactionFilter* RandomCompactionFilter(Random* rnd) { + return new ChanglingCompactionFilter(RandomName(rnd, 10)); +} + +CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd) { + return new ChanglingCompactionFilterFactory(RandomName(rnd, 10)); +} + +void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { + // boolean options + db_opt->advise_random_on_open = rnd->Uniform(2); + db_opt->allow_mmap_reads = rnd->Uniform(2); + db_opt->allow_mmap_writes = rnd->Uniform(2); + db_opt->use_direct_reads = rnd->Uniform(2); + db_opt->use_direct_io_for_flush_and_compaction = rnd->Uniform(2); + db_opt->create_if_missing = rnd->Uniform(2); + db_opt->create_missing_column_families = rnd->Uniform(2); + db_opt->enable_thread_tracking = rnd->Uniform(2); + db_opt->error_if_exists = rnd->Uniform(2); + db_opt->is_fd_close_on_exec = rnd->Uniform(2); + db_opt->paranoid_checks = rnd->Uniform(2); + db_opt->skip_log_error_on_recovery = rnd->Uniform(2); + db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); + db_opt->use_adaptive_mutex = rnd->Uniform(2); + db_opt->use_fsync = rnd->Uniform(2); + db_opt->recycle_log_file_num = rnd->Uniform(2); + db_opt->avoid_flush_during_recovery = rnd->Uniform(2); + db_opt->avoid_flush_during_shutdown = rnd->Uniform(2); + + // int options + db_opt->max_background_compactions = rnd->Uniform(100); + db_opt->max_background_flushes = rnd->Uniform(100); + db_opt->max_file_opening_threads = rnd->Uniform(100); + db_opt->max_open_files = rnd->Uniform(100); + db_opt->table_cache_numshardbits = rnd->Uniform(100); + + // size_t options + db_opt->db_write_buffer_size = rnd->Uniform(10000); + db_opt->keep_log_file_num = rnd->Uniform(10000); + db_opt->log_file_time_to_roll = rnd->Uniform(10000); + db_opt->manifest_preallocation_size = rnd->Uniform(10000); + db_opt->max_log_file_size = rnd->Uniform(10000); + + // std::string options + db_opt->db_log_dir = "path/to/db_log_dir"; + db_opt->wal_dir = "path/to/wal_dir"; + + // uint32_t options + db_opt->max_subcompactions = rnd->Uniform(100000); + + // uint64_t options + static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX); + db_opt->WAL_size_limit_MB = uint_max + rnd->Uniform(100000); + db_opt->WAL_ttl_seconds = uint_max + rnd->Uniform(100000); + db_opt->bytes_per_sync = uint_max + rnd->Uniform(100000); + db_opt->delayed_write_rate = uint_max + rnd->Uniform(100000); + db_opt->delete_obsolete_files_period_micros = uint_max + rnd->Uniform(100000); + db_opt->max_manifest_file_size = uint_max + rnd->Uniform(100000); + db_opt->max_total_wal_size = uint_max + rnd->Uniform(100000); + db_opt->wal_bytes_per_sync = uint_max + rnd->Uniform(100000); + + // unsigned int options + db_opt->stats_dump_period_sec = rnd->Uniform(100000); +} + +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { + cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4)); + + // boolean options + cf_opt->report_bg_io_stats = rnd->Uniform(2); + cf_opt->disable_auto_compactions = rnd->Uniform(2); + cf_opt->inplace_update_support = rnd->Uniform(2); + cf_opt->level_compaction_dynamic_level_bytes = rnd->Uniform(2); + 
cf_opt->optimize_filters_for_hits = rnd->Uniform(2); + cf_opt->paranoid_file_checks = rnd->Uniform(2); + cf_opt->purge_redundant_kvs_while_flush = rnd->Uniform(2); + cf_opt->force_consistency_checks = rnd->Uniform(2); + cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2); + cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); + + // double options + cf_opt->hard_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13; + cf_opt->soft_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13; + cf_opt->memtable_prefix_bloom_size_ratio = + static_cast<double>(rnd->Uniform(10000)) / 20000.0; + + // int options + cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); + cf_opt->level0_slowdown_writes_trigger = rnd->Uniform(100); + cf_opt->level0_stop_writes_trigger = rnd->Uniform(100); + cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100); + cf_opt->max_mem_compaction_level = rnd->Uniform(100); + cf_opt->max_write_buffer_number = rnd->Uniform(100); + cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100); + cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100); + cf_opt->num_levels = rnd->Uniform(100); + cf_opt->target_file_size_multiplier = rnd->Uniform(100); + + // vector int options + cf_opt->max_bytes_for_level_multiplier_additional.resize(cf_opt->num_levels); + for (int i = 0; i < cf_opt->num_levels; i++) { + cf_opt->max_bytes_for_level_multiplier_additional[i] = rnd->Uniform(100); + } + + // size_t options + cf_opt->arena_block_size = rnd->Uniform(10000); + cf_opt->inplace_update_num_locks = rnd->Uniform(10000); + cf_opt->max_successive_merges = rnd->Uniform(10000); + cf_opt->memtable_huge_page_size = rnd->Uniform(10000); + cf_opt->write_buffer_size = rnd->Uniform(10000); + + // uint32_t options + cf_opt->bloom_locality = rnd->Uniform(10000); + cf_opt->max_bytes_for_level_base = rnd->Uniform(10000); + + // uint64_t options + static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX); + cf_opt->ttl = uint_max + rnd->Uniform(10000); + cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000); + cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000); + cf_opt->max_compaction_bytes = + cf_opt->target_file_size_base * rnd->Uniform(100); + cf_opt->compaction_options_fifo.max_table_files_size = + uint_max + rnd->Uniform(10000); + + // unsigned int options + cf_opt->rate_limit_delay_max_milliseconds = rnd->Uniform(10000); + + // pointer typed options + cf_opt->prefix_extractor.reset(RandomSliceTransform(rnd)); + cf_opt->table_factory.reset(RandomTableFactory(rnd)); + cf_opt->merge_operator.reset(RandomMergeOperator(rnd)); + if (cf_opt->compaction_filter) { + delete cf_opt->compaction_filter; + } + cf_opt->compaction_filter = RandomCompactionFilter(rnd); + cf_opt->compaction_filter_factory.reset(RandomCompactionFilterFactory(rnd)); + + // custom typed options + cf_opt->compression = RandomCompressionType(rnd); + RandomCompressionTypeVector(cf_opt->num_levels, + &cf_opt->compression_per_level, rnd); +} + +Status DestroyDir(Env* env, const std::string& dir) { + Status s; + if (env->FileExists(dir).IsNotFound()) { + return s; + } + std::vector<std::string> files_in_dir; + s = env->GetChildren(dir, &files_in_dir); + if (s.ok()) { + for (auto& file_in_dir : files_in_dir) { + if (file_in_dir == "." 
|| file_in_dir == "..") { + continue; + } + s = env->DeleteFile(dir + "/" + file_in_dir); + if (!s.ok()) { + break; + } + } + } + + if (s.ok()) { + s = env->DeleteDir(dir); + } + return s; +} + +bool IsDirectIOSupported(Env* env, const std::string& dir) { + EnvOptions env_options; + env_options.use_mmap_writes = false; + env_options.use_direct_writes = true; + std::string tmp = TempFileName(dir, 999); + Status s; + { + std::unique_ptr<WritableFile> file; + s = env->NewWritableFile(tmp, &file, env_options); + } + if (s.ok()) { + s = env->DeleteFile(tmp); + } + return s.ok(); +} + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/testutil.h b/src/rocksdb/util/testutil.h new file mode 100644 index 00000000..2aab3df7 --- /dev/null +++ b/src/rocksdb/util/testutil.h @@ -0,0 +1,754 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <algorithm> +#include <deque> +#include <string> +#include <vector> + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/block_based_table_factory.h" +#include "table/internal_iterator.h" +#include "table/plain_table_factory.h" +#include "util/mutexlock.h" +#include "util/random.h" + +namespace rocksdb { +class SequentialFile; +class SequentialFileReader; + +namespace test { + +extern const uint32_t kDefaultFormatVersion; +extern const uint32_t kLatestFormatVersion; + +// Store in *dst a random string of length "len" and return a Slice that +// references the generated data. +extern Slice RandomString(Random* rnd, int len, std::string* dst); + +extern std::string RandomHumanReadableString(Random* rnd, int len); + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +enum RandomKeyType : char { RANDOM, LARGEST, SMALLEST, MIDDLE }; +extern std::string RandomKey(Random* rnd, int len, + RandomKeyType type = RandomKeyType::RANDOM); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +// A wrapper that allows injection of errors. +class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& soptions) override { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; + +#ifndef NDEBUG +// An internal comparator that just forward comparing results from the +// user comparator in it. 
Can be used to test entities that have no dependency
+// on internal key structure but consume an InternalKeyComparator, like
+// BlockBasedTable.
+class PlainInternalKeyComparator : public InternalKeyComparator {
+ public:
+  explicit PlainInternalKeyComparator(const Comparator* c)
+      : InternalKeyComparator(c) {}
+
+  virtual ~PlainInternalKeyComparator() {}
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    return user_comparator()->Compare(a, b);
+  }
+};
+#endif
+
+// A test comparator which compares two strings in this way:
+// (1) first compare the 8-byte prefix in alphabetical order,
+// (2) if two strings share the same prefix, sort the rest of the string
+//     in reverse alphabetical order.
+// This helps simulate the case of a compound key of [entity][timestamp] with
+// the latest timestamp first.
+class SimpleSuffixReverseComparator : public Comparator {
+ public:
+  SimpleSuffixReverseComparator() {}
+
+  virtual const char* Name() const override {
+    return "SimpleSuffixReverseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    Slice prefix_a = Slice(a.data(), 8);
+    Slice prefix_b = Slice(b.data(), 8);
+    int prefix_comp = prefix_a.compare(prefix_b);
+    if (prefix_comp != 0) {
+      return prefix_comp;
+    } else {
+      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
+      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
+      return -(suffix_a.compare(suffix_b));
+    }
+  }
+  virtual void FindShortestSeparator(std::string* /*start*/,
+                                     const Slice& /*limit*/) const override {}
+
+  virtual void FindShortSuccessor(std::string* /*key*/) const override {}
+};
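+
+// (Editor's sketch, not part of the upstream header; the function name is
+// hypothetical.) With equal 8-byte prefixes the comparison above falls
+// through to the *negated* suffix compare, so the lexically larger suffix
+// sorts first, i.e. latest-timestamp-first:
+inline int SimpleSuffixReverseComparatorExample() {
+  SimpleSuffixReverseComparator cmp;
+  // The "entity00" prefixes match; suffix "2" > "1", so Compare() returns a
+  // negative value and "entity002" orders before "entity001".
+  return cmp.Compare(Slice("entity002"), Slice("entity001"));
+}
+
+// Returns a user key comparator that can be used for comparing two uint64_t
+// slices. Instead of comparing slices byte-wise, it compares all the 8 bytes
+// at once. Assumes the same endianness is used throughout the database's
+// lifetime. Semantics of the comparison would differ from the Bytewise
+// comparator on little-endian machines.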
+extern const Comparator* Uint64Comparator(); + +// Iterator over a vector of keys/values +class VectorIterator : public InternalIterator { + public: + explicit VectorIterator(const std::vector<std::string>& keys) + : keys_(keys), current_(keys.size()) { + std::sort(keys_.begin(), keys_.end()); + values_.resize(keys.size()); + } + + VectorIterator(const std::vector<std::string>& keys, + const std::vector<std::string>& values) + : keys_(keys), values_(values), current_(keys.size()) { + assert(keys_.size() == values_.size()); + } + + virtual bool Valid() const override { return current_ < keys_.size(); } + + virtual void SeekToFirst() override { current_ = 0; } + virtual void SeekToLast() override { current_ = keys_.size() - 1; } + + virtual void Seek(const Slice& target) override { + current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + + virtual void SeekForPrev(const Slice& target) override { + current_ = std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + if (!Valid()) { + SeekToLast(); + } else { + Prev(); + } + } + + virtual void Next() override { current_++; } + virtual void Prev() override { current_--; } + + virtual Slice key() const override { return Slice(keys_[current_]); } + virtual Slice value() const override { return Slice(values_[current_]); } + + virtual Status status() const override { return Status::OK(); } + + virtual bool IsKeyPinned() const override { return true; } + virtual bool IsValuePinned() const override { return true; } + + private: + std::vector<std::string> keys_; + std::vector<std::string> values_; + size_t current_; +}; +extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf, + const std::string& fname); + +extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); + +extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se, + const std::string& fname); + +class StringSink: public WritableFile { + public: + std::string contents_; + + explicit StringSink(Slice* reader_contents = nullptr) : + WritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { + if (reader_contents_ != nullptr) { + *reader_contents_ = Slice(contents_.data(), 0); + } + } + + const std::string& contents() const { return contents_; } + + virtual Status Truncate(uint64_t size) override { + contents_.resize(static_cast<size_t>(size)); + return Status::OK(); + } + virtual Status Close() override { return Status::OK(); } + virtual Status Flush() override { + if (reader_contents_ != nullptr) { + assert(reader_contents_->size() <= last_flush_); + size_t offset = last_flush_ - reader_contents_->size(); + *reader_contents_ = Slice( + contents_.data() + offset, + contents_.size() - offset); + last_flush_ = contents_.size(); + } + + return Status::OK(); + } + virtual Status Sync() override { return Status::OK(); } + virtual Status Append(const Slice& slice) override { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + void Drop(size_t bytes) { + if (reader_contents_ != nullptr) { + contents_.resize(contents_.size() - bytes); + *reader_contents_ = Slice( + reader_contents_->data(), reader_contents_->size() - bytes); + last_flush_ = contents_.size(); + } + } + + private: + Slice* reader_contents_; + size_t last_flush_; +}; + +// A wrapper around a StringSink to give it a RandomRWFile interface +class RandomRWStringSink : public RandomRWFile { + public: + explicit RandomRWStringSink(StringSink* ss) : ss_(ss) {} + + Status 
Write(uint64_t offset, const Slice& data) override {
+    if (offset + data.size() > ss_->contents_.size()) {
+      ss_->contents_.resize(static_cast<size_t>(offset) + data.size(), '\0');
+    }
+
+    char* pos = const_cast<char*>(ss_->contents_.data() + offset);
+    memcpy(pos, data.data(), data.size());
+    return Status::OK();
+  }
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* /*scratch*/) const override {
+    *result = Slice(nullptr, 0);
+    if (offset < ss_->contents_.size()) {
+      size_t str_res_sz =
+          std::min(static_cast<size_t>(ss_->contents_.size() - offset), n);
+      *result = Slice(ss_->contents_.data() + offset, str_res_sz);
+    }
+    return Status::OK();
+  }
+
+  Status Flush() override { return Status::OK(); }
+
+  Status Sync() override { return Status::OK(); }
+
+  Status Close() override { return Status::OK(); }
+
+  const std::string& contents() const { return ss_->contents(); }
+
+ private:
+  StringSink* ss_;
+};
+
+// Like StringSink, this writes into a string. Unlike StringSink, it
+// has some initial content and overwrites it, just like a recycled
+// log file.
+class OverwritingStringSink : public WritableFile {
+ public:
+  explicit OverwritingStringSink(Slice* reader_contents)
+      : WritableFile(),
+        contents_(""),
+        reader_contents_(reader_contents),
+        last_flush_(0) {}
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Truncate(uint64_t size) override {
+    contents_.resize(static_cast<size_t>(size));
+    return Status::OK();
+  }
+  virtual Status Close() override { return Status::OK(); }
+  virtual Status Flush() override {
+    if (last_flush_ < contents_.size()) {
+      assert(reader_contents_->size() >= contents_.size());
+      memcpy((char*)reader_contents_->data() + last_flush_,
+             contents_.data() + last_flush_, contents_.size() - last_flush_);
+      last_flush_ = contents_.size();
+    }
+    return Status::OK();
+  }
+  virtual Status Sync() override { return Status::OK(); }
+  virtual Status Append(const Slice& slice) override {
+    contents_.append(slice.data(), slice.size());
+    return Status::OK();
+  }
+  void Drop(size_t bytes) {
+    contents_.resize(contents_.size() - bytes);
+    if (last_flush_ > contents_.size()) last_flush_ = contents_.size();
+  }
+
+ private:
+  std::string contents_;
+  Slice* reader_contents_;
+  size_t last_flush_;
+};
+
+class StringSource: public RandomAccessFile {
+ public:
+  explicit StringSource(const Slice& contents, uint64_t uniq_id = 0,
+                        bool mmap = false)
+      : contents_(contents.data(), contents.size()),
+        uniq_id_(uniq_id),
+        mmap_(mmap),
+        total_reads_(0) {}
+
+  virtual ~StringSource() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const override {
+    total_reads_++;
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - static_cast<size_t>(offset);
+    }
+    if (!mmap_) {
+      memcpy(scratch, &contents_[static_cast<size_t>(offset)], n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&contents_[static_cast<size_t>(offset)], n);
+    }
+    return Status::OK();
+  }
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
+    if (max_size < 20) {
+      return 0;
+    }
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, uniq_id_);
+    rid = EncodeVarint64(rid, 0);
+    return static_cast<size_t>(rid-id);
+  }
+
+  int total_reads() const { return total_reads_; }
+
+  void set_total_reads(int tr) { total_reads_ = tr; }
+
+ private:
+  std::string
contents_; + uint64_t uniq_id_; + bool mmap_; + mutable int total_reads_; +}; + +class NullLogger : public Logger { + public: + using Logger::Logv; + virtual void Logv(const char* /*format*/, va_list /*ap*/) override {} + virtual size_t GetLogFileSize() const override { return 0; } +}; + +// Corrupts key by changing the type +extern void CorruptKeyType(InternalKey* ikey); + +extern std::string KeyStr(const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + +class SleepingBackgroundTask { + public: + SleepingBackgroundTask() + : bg_cv_(&mutex_), + should_sleep_(true), + done_with_sleep_(false), + sleeping_(false) {} + + bool IsSleeping() { + MutexLock l(&mutex_); + return sleeping_; + } + void DoSleep() { + MutexLock l(&mutex_); + sleeping_ = true; + bg_cv_.SignalAll(); + while (should_sleep_) { + bg_cv_.Wait(); + } + sleeping_ = false; + done_with_sleep_ = true; + bg_cv_.SignalAll(); + } + void WaitUntilSleeping() { + MutexLock l(&mutex_); + while (!sleeping_ || !should_sleep_) { + bg_cv_.Wait(); + } + } + void WakeUp() { + MutexLock l(&mutex_); + should_sleep_ = false; + bg_cv_.SignalAll(); + } + void WaitUntilDone() { + MutexLock l(&mutex_); + while (!done_with_sleep_) { + bg_cv_.Wait(); + } + } + bool WokenUp() { + MutexLock l(&mutex_); + return should_sleep_ == false; + } + + void Reset() { + MutexLock l(&mutex_); + should_sleep_ = true; + done_with_sleep_ = false; + } + + static void DoSleepTask(void* arg) { + reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep(); + } + + private: + port::Mutex mutex_; + port::CondVar bg_cv_; // Signalled when background work finishes + bool should_sleep_; + bool done_with_sleep_; + bool sleeping_; +}; + +// Filters merge operands and values that are equal to `num`. 
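+// A minimal usage sketch (hypothetical `db` handle; values are written with
+// EncodeInt() below so they decode as 8-byte integers -- values that are not
+// 8-byte encodings are filtered out as well):
+//
+//   test::FilterNumber filter(42);           // drop entries encoding 42
+//   rocksdb::Options options;
+//   options.compaction_filter = &filter;
+//   db->Put(rocksdb::WriteOptions(), "k", test::EncodeInt(42));
+//   // "k" is dropped the next time compaction visits it.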
+class FilterNumber : public CompactionFilter {
+ public:
+  explicit FilterNumber(uint64_t num) : num_(num) {}
+
+  std::string last_merge_operand_key() { return last_merge_operand_key_; }
+
+  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
+              const rocksdb::Slice& value, std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  bool FilterMergeOperand(int /*level*/, const rocksdb::Slice& key,
+                          const rocksdb::Slice& value) const override {
+    last_merge_operand_key_ = key.ToString();
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "FilterBadMergeOperand"; }
+
+ private:
+  mutable std::string last_merge_operand_key_;
+  uint64_t num_;
+};
+
+inline std::string EncodeInt(uint64_t x) {
+  std::string result;
+  PutFixed64(&result, x);
+  return result;
+}
+
+class StringEnv : public EnvWrapper {
+ public:
+  class SeqStringSource : public SequentialFile {
+   public:
+    explicit SeqStringSource(const std::string& data)
+        : data_(data), offset_(0) {}
+    ~SeqStringSource() {}
+    Status Read(size_t n, Slice* result, char* scratch) override {
+      std::string output;
+      if (offset_ < data_.size()) {
+        n = std::min(data_.size() - offset_, n);
+        memcpy(scratch, data_.data() + offset_, n);
+        offset_ += n;
+        *result = Slice(scratch, n);
+      } else {
+        return Status::InvalidArgument(
+            "Attempt to read when it has already reached eof.");
+      }
+      return Status::OK();
+    }
+    Status Skip(uint64_t n) override {
+      if (offset_ >= data_.size()) {
+        return Status::InvalidArgument(
+            "Attempt to read when it has already reached eof.");
+      }
+      // TODO(yhchiang): Currently doesn't handle the overflow case.
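+      // One possible guard for the TODO above (a sketch only): clamp the
+      // skip to the bytes that remain, so a huge n cannot wrap offset_ on
+      // platforms with a 32-bit size_t:
+      //   uint64_t remaining = data_.size() - offset_;
+      //   if (n > remaining) n = remaining;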
+      offset_ += static_cast<size_t>(n);
+      return Status::OK();
+    }
+
+   private:
+    std::string data_;
+    size_t offset_;
+  };
+
+  class StringSink : public WritableFile {
+   public:
+    explicit StringSink(std::string* contents)
+        : WritableFile(), contents_(contents) {}
+    virtual Status Truncate(uint64_t size) override {
+      contents_->resize(static_cast<size_t>(size));
+      return Status::OK();
+    }
+    virtual Status Close() override { return Status::OK(); }
+    virtual Status Flush() override { return Status::OK(); }
+    virtual Status Sync() override { return Status::OK(); }
+    virtual Status Append(const Slice& slice) override {
+      contents_->append(slice.data(), slice.size());
+      return Status::OK();
+    }
+
+   private:
+    std::string* contents_;
+  };
+
+  explicit StringEnv(Env* t) : EnvWrapper(t) {}
+  virtual ~StringEnv() {}
+
+  const std::string& GetContent(const std::string& f) { return files_[f]; }
+
+  const Status WriteToNewFile(const std::string& file_name,
+                              const std::string& content) {
+    std::unique_ptr<WritableFile> r;
+    auto s = NewWritableFile(file_name, &r, EnvOptions());
+    if (!s.ok()) {
+      return s;
+    }
+    r->Append(content);
+    r->Flush();
+    r->Close();
+    assert(files_[file_name] == content);
+    return Status::OK();
+  }
+
+  // The Env interface, implemented against the in-memory files_ map;
+  // operations the tests do not need simply return Status::NotSupported().
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& /*options*/) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return Status::NotFound("The specified file does not exist", f);
+    }
+    r->reset(new SeqStringSource(iter->second));
+    return Status::OK();
+  }
+  Status NewRandomAccessFile(const std::string& /*f*/,
+                             std::unique_ptr<RandomAccessFile>* /*r*/,
+                             const EnvOptions& /*options*/) override {
+    return Status::NotSupported();
+  }
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& /*options*/) override {
+    auto iter = files_.find(f);
+    if (iter != files_.end()) {
+      return Status::IOError("The specified file already exists", f);
+    }
+    r->reset(new StringSink(&files_[f]));
+    return Status::OK();
+  }
+  virtual Status NewDirectory(const std::string& /*name*/,
+                              std::unique_ptr<Directory>* /*result*/) override {
+    return Status::NotSupported();
+  }
+  Status FileExists(const std::string& f) override {
+    if (files_.find(f) == files_.end()) {
+      return Status::NotFound();
+    }
+    return Status::OK();
+  }
+  Status GetChildren(const std::string& /*dir*/,
+                     std::vector<std::string>* /*r*/) override {
+    return Status::NotSupported();
+  }
+  Status DeleteFile(const std::string& f) override {
+    files_.erase(f);
+    return Status::OK();
+  }
+  Status CreateDir(const std::string& /*d*/) override {
+    return Status::NotSupported();
+  }
+  Status CreateDirIfMissing(const std::string& /*d*/) override {
+    return Status::NotSupported();
+  }
+  Status DeleteDir(const std::string& /*d*/) override {
+    return Status::NotSupported();
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return Status::NotFound("The specified file does not exist:", f);
+    }
+    *s = iter->second.size();
+    return Status::OK();
+  }
+
+  Status GetFileModificationTime(const std::string& /*fname*/,
+                                 uint64_t* /*file_mtime*/) override {
+    return Status::NotSupported();
+  }
+
+  Status RenameFile(const std::string& /*s*/,
+                    const std::string& /*t*/) override {
+    return Status::NotSupported();
+  }
+
+  Status LinkFile(const std::string& /*s*/, const std::string& 
/*t*/) override { + return Status::NotSupported(); + } + + Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { + return Status::NotSupported(); + } + + Status UnlockFile(FileLock* /*l*/) override { return Status::NotSupported(); } + + protected: + std::unordered_map<std::string, std::string> files_; +}; + +// Randomly initialize the given DBOptions +void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); + +// Randomly initialize the given ColumnFamilyOptions +// Note that the caller is responsible for releasing non-null +// cf_opt->compaction_filter. +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd); + +// A dummy merge operator which can change its name +class ChanglingMergeOperator : public MergeOperator { + public: + explicit ChanglingMergeOperator(const std::string& name) + : name_(name + "MergeOperator") {} + ~ChanglingMergeOperator() {} + + void SetName(const std::string& name) { name_ = name; } + + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { + return false; + } + virtual bool PartialMergeMulti(const Slice& /*key*/, + const std::deque<Slice>& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + return false; + } + virtual const char* Name() const override { return name_.c_str(); } + + protected: + std::string name_; +}; + +// Returns a dummy merge operator with random name. +MergeOperator* RandomMergeOperator(Random* rnd); + +// A dummy compaction filter which can change its name +class ChanglingCompactionFilter : public CompactionFilter { + public: + explicit ChanglingCompactionFilter(const std::string& name) + : name_(name + "CompactionFilter") {} + ~ChanglingCompactionFilter() {} + + void SetName(const std::string& name) { name_ = name; } + + bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; +}; + +// Returns a dummy compaction filter with a random name. +CompactionFilter* RandomCompactionFilter(Random* rnd); + +// A dummy compaction filter factory which can change its name +class ChanglingCompactionFilterFactory : public CompactionFilterFactory { + public: + explicit ChanglingCompactionFilterFactory(const std::string& name) + : name_(name + "CompactionFilterFactory") {} + ~ChanglingCompactionFilterFactory() {} + + void SetName(const std::string& name) { name_ = name; } + + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr<CompactionFilter>(); + } + + // Returns a name that identifies this compaction filter factory. 
+ const char* Name() const override { return name_.c_str(); } + + protected: + std::string name_; +}; + +CompressionType RandomCompressionType(Random* rnd); + +void RandomCompressionTypeVector(const size_t count, + std::vector<CompressionType>* types, + Random* rnd); + +CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd); + +const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined = -1); + +TableFactory* RandomTableFactory(Random* rnd, int pre_defined = -1); + +std::string RandomName(Random* rnd, const size_t len); + +Status DestroyDir(Env* env, const std::string& dir); + +bool IsDirectIOSupported(Env* env, const std::string& dir); + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/thread_list_test.cc b/src/rocksdb/util/thread_list_test.cc new file mode 100644 index 00000000..a4a343a9 --- /dev/null +++ b/src/rocksdb/util/thread_list_test.cc @@ -0,0 +1,352 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <mutex> +#include <condition_variable> + +#include "monitoring/thread_status_updater.h" +#include "rocksdb/db.h" +#include "util/testharness.h" + +#ifdef ROCKSDB_USING_THREAD_STATUS + +namespace rocksdb { + +class SimulatedBackgroundTask { + public: + SimulatedBackgroundTask( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name, + const ThreadStatus::OperationType operation_type = + ThreadStatus::OP_UNKNOWN, + const ThreadStatus::StateType state_type = + ThreadStatus::STATE_UNKNOWN) + : db_key_(db_key), db_name_(db_name), + cf_key_(cf_key), cf_name_(cf_name), + operation_type_(operation_type), state_type_(state_type), + should_run_(true), running_count_(0) { + Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo( + db_key_, db_name_, cf_key_, cf_name_); + } + + ~SimulatedBackgroundTask() { + Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_); + } + + void Run() { + std::unique_lock<std::mutex> l(mutex_); + running_count_++; + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( + operation_type_); + Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_); + while (should_run_) { + bg_cv_.wait(l); + } + Env::Default()->GetThreadStatusUpdater()->ClearThreadState(); + Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation(); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(nullptr); + running_count_--; + bg_cv_.notify_all(); + } + + void FinishAllTasks() { + std::unique_lock<std::mutex> l(mutex_); + should_run_ = false; + bg_cv_.notify_all(); + } + + void WaitUntilScheduled(int job_count, Env* env) { + while (running_count_ < job_count) { + env->SleepForMicroseconds(1000); + } + } + + void WaitUntilDone() { + std::unique_lock<std::mutex> l(mutex_); + while (running_count_ > 0) { + bg_cv_.wait(l); + } + } + + static void DoSimulatedTask(void* arg) { + reinterpret_cast<SimulatedBackgroundTask*>(arg)->Run(); + } + + private: + const void* db_key_; + const std::string db_name_; + const void* cf_key_; + const std::string cf_name_; + const ThreadStatus::OperationType operation_type_; + const ThreadStatus::StateType state_type_; + std::mutex mutex_; + std::condition_variable bg_cv_; + bool should_run_; + std::atomic<int> 
running_count_; +}; + +class ThreadListTest : public testing::Test { + public: + ThreadListTest() { + } +}; + +TEST_F(ThreadListTest, GlobalTables) { + // verify the global tables for operations and states are properly indexed. + for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) { + ASSERT_EQ(global_operation_table[type].type, type); + ASSERT_EQ(global_operation_table[type].name, + ThreadStatus::GetOperationName( + ThreadStatus::OperationType(type))); + } + + for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) { + ASSERT_EQ(global_state_table[type].type, type); + ASSERT_EQ(global_state_table[type].name, + ThreadStatus::GetStateName( + ThreadStatus::StateType(type))); + } + + for (int stage = 0; stage != ThreadStatus::NUM_OP_STAGES; ++stage) { + ASSERT_EQ(global_op_stage_table[stage].stage, stage); + ASSERT_EQ(global_op_stage_table[stage].name, + ThreadStatus::GetOperationStageName( + ThreadStatus::OperationStage(stage))); + } +} + +TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { + Env* env = Env::Default(); + const int kHighPriorityThreads = 3; + const int kLowPriorityThreads = 5; + const int kSimulatedHighPriThreads = kHighPriorityThreads - 1; + const int kSimulatedLowPriThreads = kLowPriorityThreads / 3; + env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH); + env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW); + + SimulatedBackgroundTask running_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu"); + + for (int test = 0; test < kSimulatedHighPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::HIGH); + } + for (int test = 0; test < kSimulatedLowPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::LOW); + } + running_task.WaitUntilScheduled( + kSimulatedHighPriThreads + kSimulatedLowPriThreads, env); + + std::vector<ThreadStatus> thread_list; + + // Verify the number of running threads in each pool. 
+ env->GetThreadList(&thread_list); + int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0}; + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + ASSERT_EQ( + running_count[ThreadStatus::HIGH_PRIORITY], + kSimulatedHighPriThreads); + ASSERT_EQ( + running_count[ThreadStatus::LOW_PRIORITY], + kSimulatedLowPriThreads); + ASSERT_EQ( + running_count[ThreadStatus::USER], 0); + + running_task.FinishAllTasks(); + running_task.WaitUntilDone(); + + // Verify none of the threads are running + env->GetThreadList(&thread_list); + + for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) { + running_count[i] = 0; + } + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + + ASSERT_EQ( + running_count[ThreadStatus::HIGH_PRIORITY], 0); + ASSERT_EQ( + running_count[ThreadStatus::LOW_PRIORITY], 0); + ASSERT_EQ( + running_count[ThreadStatus::USER], 0); +} + +namespace { + void UpdateStatusCounts( + const std::vector<ThreadStatus>& thread_list, + int operation_counts[], int state_counts[]) { + for (auto thread_status : thread_list) { + operation_counts[thread_status.operation_type]++; + state_counts[thread_status.state_type]++; + } + } + + void VerifyAndResetCounts( + const int correct_counts[], int collected_counts[], int size) { + for (int i = 0; i < size; ++i) { + ASSERT_EQ(collected_counts[i], correct_counts[i]); + collected_counts[i] = 0; + } + } + + void UpdateCount( + int operation_counts[], int from_event, int to_event, int amount) { + operation_counts[from_event] -= amount; + operation_counts[to_event] += amount; + } +} // namespace + +TEST_F(ThreadListTest, SimpleEventTest) { + Env* env = Env::Default(); + + // simulated tasks + const int kFlushWriteTasks = 3; + SimulatedBackgroundTask flush_write_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_FLUSH); + + const int kCompactionWriteTasks = 4; + SimulatedBackgroundTask compaction_write_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionReadTasks = 5; + SimulatedBackgroundTask compaction_read_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionWaitTasks = 6; + SimulatedBackgroundTask compaction_wait_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + // setup right answers + int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + correct_operation_counts[ThreadStatus::OP_FLUSH] = + kFlushWriteTasks; + correct_operation_counts[ThreadStatus::OP_COMPACTION] = + kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks; + + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_FLUSH], Env::HIGH); + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW); + + // schedule the simulated tasks + for (int t = 0; t < kFlushWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &flush_write_task, Env::Priority::HIGH); + } + flush_write_task.WaitUntilScheduled(kFlushWriteTasks, env); + + for (int t = 0; t < kCompactionWriteTasks; ++t) { + 
env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_write_task, Env::Priority::LOW);
+  }
+  compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks, env);
+
+  for (int t = 0; t < kCompactionReadTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_read_task, Env::Priority::LOW);
+  }
+  compaction_read_task.WaitUntilScheduled(kCompactionReadTasks, env);
+
+  for (int t = 0; t < kCompactionWaitTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_wait_task, Env::Priority::LOW);
+  }
+  compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks, env);
+
+  // verify the thread-status
+  int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+  int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0};
+
+  std::vector<ThreadStatus> thread_list;
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-wait tasks and see if the thread-status
+  // reflects this update
+  compaction_wait_task.FinishAllTasks();
+  compaction_wait_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate flush-write tasks and see if the thread-status
+  // reflects this update
+  flush_write_task.FinishAllTasks();
+  flush_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH,
+              ThreadStatus::OP_UNKNOWN, kFlushWriteTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-write tasks and see if the thread-status
+  // reflects this update
+  compaction_write_task.FinishAllTasks();
+  compaction_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-read tasks and see if the thread-status
+  // reflects this update
+  compaction_read_task.FinishAllTasks();
+  compaction_read_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionReadTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return 0;
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
diff --git a/src/rocksdb/util/thread_local.cc b/src/rocksdb/util/thread_local.cc
new file mode 100644
index 00000000..7346eff1
--- /dev/null
+++ b/src/rocksdb/util/thread_local.cc
@@ -0,0 +1,554 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/thread_local.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
+#include <stdlib.h>
+
+namespace rocksdb {
+
+struct Entry {
+  Entry() : ptr(nullptr) {}
+  Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {}
+  std::atomic<void*> ptr;
+};
+
+class StaticMeta;
+
+// This is the structure that is declared as "thread_local" storage.
+// The vector keeps a list of atomic pointers, one per ThreadLocalPtr
+// instance, for the "current" thread. The vector is indexed by an Id that
+// is unique in the process and associated with one ThreadLocalPtr instance.
+// The Id is assigned by a global StaticMeta singleton. So if we instantiated
+// 3 ThreadLocalPtr instances, each thread will have a ThreadData with a
+// vector of size 3:
+//     ---------------------------------------------------
+//     |          | instance 1 | instance 2 | instance 3 |
+//     ---------------------------------------------------
+//     | thread 1 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 2 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 3 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+struct ThreadData {
+  explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst)
+      : entries(),
+        next(nullptr),
+        prev(nullptr),
+        inst(_inst) {}
+  std::vector<Entry> entries;
+  ThreadData* next;
+  ThreadData* prev;
+  ThreadLocalPtr::StaticMeta* inst;
+};
+
+class ThreadLocalPtr::StaticMeta {
+public:
+  StaticMeta();
+
+  // Return the next available Id
+  uint32_t GetId();
+  // Return the next available Id without claiming it
+  uint32_t PeekId() const;
+  // Return the given Id back to the free pool. This also triggers the
+  // UnrefHandler for the associated pointer value (if not NULL) for all
+  // threads.
+  void ReclaimId(uint32_t id);
+
+  // Return the pointer value for the given id for the current thread.
+  void* Get(uint32_t id) const;
+  // Reset the pointer value for the given id for the current thread.
+  void Reset(uint32_t id, void* ptr);
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(uint32_t id, void* ptr);
+  // Atomically compare and swap the provided value only if it equals
+  // the expected value.
+  bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
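+  // For example, a FoldFunc summing per-thread counters stored as
+  // heap-allocated uint64_t values might look like this (sketch; assumes
+  // `inst` and `id` are the StaticMeta instance and a claimed Id):
+  //   uint64_t total = 0;
+  //   inst->Fold(id, [](void* entry, void* res) {
+  //     *static_cast<uint64_t*>(res) += *static_cast<uint64_t*>(entry);
+  //   }, &total);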
+  void Fold(uint32_t id, FoldFunc func, void* res);
+
+  // Register the UnrefHandler for id
+  void SetHandler(uint32_t id, UnrefHandler handler);
+
+  // Protects inst, next_instance_id_, free_instance_ids_, head_, and
+  // ThreadData.entries.
+  //
+  // Note that here we prefer a function-level static variable to the usual
+  // global static variable. The reason is that C++ destroys static variables
+  // in the reverse order of their construction, but guarantees no particular
+  // construction order when global static variables are defined in different
+  // files. Function-level static variables, on the other hand, are
+  // initialized when their enclosing function is first called, so their
+  // construction order can be controlled by arranging the first function
+  // calls in the right order.
+  //
+  // For instance, the following function contains a function-level static
+  // variable. We place a dummy call to it inside Env::Default() to pin down
+  // its construction order.
+  static port::Mutex* Mutex();
+
+  // Returns the member mutex of the current StaticMeta. In general,
+  // Mutex() should be used instead of this one. However, in case where
+  // the static variable inside Instance() goes out of scope, MemberMutex()
+  // should be used. One example is the OnThreadExit() function.
+  port::Mutex* MemberMutex() { return &mutex_; }
+
+private:
+  // Get the UnrefHandler for id.
+  // REQUIRES: mutex locked
+  UnrefHandler GetHandler(uint32_t id);
+
+  // Triggered before a thread terminates
+  static void OnThreadExit(void* ptr);
+
+  // Add the current thread's ThreadData to the global chain
+  // REQUIRES: mutex locked
+  void AddThreadData(ThreadData* d);
+
+  // Remove the current thread's ThreadData from the global chain
+  // REQUIRES: mutex locked
+  void RemoveThreadData(ThreadData* d);
+
+  static ThreadData* GetThreadLocal();
+
+  uint32_t next_instance_id_;
+  // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
+  // frequently. This also prevents it from blowing up the vector space.
+  autovector<uint32_t> free_instance_ids_;
+  // Chain all thread local structures together. This is necessary since
+  // when one ThreadLocalPtr gets destroyed, we need to loop over each
+  // thread's version of the pointer corresponding to that instance and
+  // call UnrefHandler for it.
+  ThreadData head_;
+
+  std::unordered_map<uint32_t, UnrefHandler> handler_map_;
+
+  // The private mutex. Developers should always use Mutex() instead of
+  // using this variable directly.
+  port::Mutex mutex_;
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+  // Thread local storage
+  static __thread ThreadData* tls_;
+#endif
+
+  // Used to make thread exit trigger possible if !defined(OS_MACOSX).
+  // Otherwise, used to retrieve thread data.
+  pthread_key_t pthread_key_;
+};
+
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+__thread ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
+#endif
+
+// Windows doesn't support a per-thread destructor with its
+// TLS primitives. So, we build it manually by inserting a
+// function to be called on each thread's exit.
+// See http://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
+// and http://www.nynaeve.net/?p=183
+//
+// Really, we do this to have a clear conscience, since using TLS with
+// thread pools is iffy (although OK within a single request); otherwise,
+// threads have no identity in their modern use.
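+// For contrast, POSIX needs none of the machinery below: the per-thread
+// destructor is registered once at key creation, as StaticMeta's
+// constructor does further down. A sketch:
+//
+//   pthread_key_t key;
+//   pthread_key_create(&key, &OnThreadExit);  // dtor runs at thread exit
+//
+// Windows TLS offers no such hook, hence the loader-callback tricks below.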
+
+// This runs on Windows only, called from the system loader.
+#ifdef OS_WIN
+
+// The Windows cleanup routine is invoked from the system loader with a
+// different signature, so we cannot directly hook up the original
+// OnThreadExit, which is a private member; instead, StaticMeta shares the
+// address of that function with us so we can invoke it.
+namespace wintlscleanup {
+
+// This is set to OnThreadExit in StaticMeta singleton constructor
+UnrefHandler thread_local_inclass_routine = nullptr;
+pthread_key_t thread_local_key = pthread_key_t (-1);
+
+// Static callback function to call with each thread termination.
+void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
+  // We decided to punt on PROCESS_EXIT
+  if (DLL_THREAD_DETACH == reason) {
+    if (thread_local_key != pthread_key_t(-1) &&
+        thread_local_inclass_routine != nullptr) {
+      void* tls = TlsGetValue(thread_local_key);
+      if (tls != nullptr) {
+        thread_local_inclass_routine(tls);
+      }
+    }
+  }
+}
+
+}  // wintlscleanup
+
+// extern "C" suppresses C++ name mangling so we know the symbol name for the
+// linker /INCLUDE:symbol pragmas below.
+extern "C" {
+
+#ifdef _MSC_VER
+// The linker must not discard thread_callback_on_exit. (We force a reference
+// to this variable with a linker /include:symbol pragma to ensure that.) If
+// this variable is discarded, the OnThreadExit function will never be called.
+#ifndef _X86_
+
+// .CRT section is merged with .rdata on x64 so it must be constant data.
+#pragma const_seg(".CRT$XLB")
+// When defining a const variable, it must have external linkage to be sure the
+// linker doesn't discard it.
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit;
+const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit =
+    wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma const_seg()
+
+#pragma comment(linker, "/include:_tls_used")
+#pragma comment(linker, "/include:p_thread_callback_on_exit")
+
+#else  // _X86_
+
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma data_seg()
+
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit")
+
+#endif  // _X86_
+
+#else
+// https://github.com/couchbase/gperftools/blob/master/src/windows/port.cc
+BOOL WINAPI DllMain(HINSTANCE h, DWORD dwReason, PVOID pv) {
+  if (dwReason == DLL_THREAD_DETACH)
+    wintlscleanup::WinOnThreadExit(h, dwReason, pv);
+  return TRUE;
+}
+#endif
+}  // extern "C"
+
+#endif  // OS_WIN
+
+void ThreadLocalPtr::InitSingletons() { ThreadLocalPtr::Instance(); }
+
+ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
+  // Here we prefer a function-level static variable to a global static
+  // variable, as a function-level static variable is initialized when the
+  // function is first called. As a result, we can properly control its
+  // construction order by preparing its first function call.
+  //
+  // Note that here we decide to make "inst" a static pointer w/o deleting
+  // it at the end, instead of a static variable. This is to avoid the
+  // following destruction-order disaster, which happens when a child thread
+  // using ThreadLocalPtr dies AFTER the main thread dies: when a child
+  // thread happens to use ThreadLocalPtr, it will try to delete its
+  // thread-local data in its OnThreadExit when the child thread dies.
+  // However, OnThreadExit depends on the following variable. As a result,
+  // if the main thread dies before any child thread that happens to use
+  // ThreadLocalPtr dies, then the destruction of the following variable
+  // will go first, then OnThreadExit, therefore causing invalid access.
+  //
+  // The above problem can be solved by using thread_local to store tls_
+  // instead of using __thread. The major difference between thread_local
+  // and __thread is that thread_local supports dynamic construction and
+  // destruction of non-primitive typed variables. As a result, we can
+  // guarantee the destruction order even when the main thread dies before
+  // any child threads. However, thread_local is not supported in all
+  // compilers that accept -std=c++11 (e.g., Mac with Xcode < 8; Xcode 8+
+  // supports thread_local).
+  static ThreadLocalPtr::StaticMeta* inst = new ThreadLocalPtr::StaticMeta();
+  return inst;
+}
+
+port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() { return &Instance()->mutex_; }
+
+void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
+  auto* tls = static_cast<ThreadData*>(ptr);
+  assert(tls != nullptr);
+
+  // Use the StaticMeta pointer cached in the ThreadData instead of calling
+  // StaticMeta::Instance() directly: the static variable inside Instance()
+  // might already have gone out of scope here, in case this OnThreadExit is
+  // called after the main thread dies.
+  auto* inst = tls->inst;
+  pthread_setspecific(inst->pthread_key_, nullptr);
+
+  MutexLock l(inst->MemberMutex());
+  inst->RemoveThreadData(tls);
+  // Unref stored pointers of current thread from all instances
+  uint32_t id = 0;
+  for (auto& e : tls->entries) {
+    void* raw = e.ptr.load();
+    if (raw != nullptr) {
+      auto unref = inst->GetHandler(id);
+      if (unref != nullptr) {
+        unref(raw);
+      }
+    }
+    ++id;
+  }
+  // Delete the thread-local structure no matter the platform (Mac included)
+  delete tls;
+}
+
+ThreadLocalPtr::StaticMeta::StaticMeta()
+    : next_instance_id_(0),
+      head_(this),
+      pthread_key_(0) {
+  if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
+    abort();
+  }
+
+  // OnThreadExit does not get called for the main thread.
+  // Call through the static destructor mechanism to avoid a memory leak.
+  //
+  // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global
+  // singleton (destructors are invoked in reverse order of constructor
+  // _completion_); the latter must not mutate internal members. This
+  // cleanup mechanism inherently relies on use-after-release of the
+  // StaticMeta, and is brittle with respect to compiler-specific handling
+  // of memory backing destructed statically-scoped objects. Perhaps
+  // registering with atexit(3) would be more robust.
+  //
+  // This is not required on Windows.
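+  // A sketch of the atexit(3) alternative mentioned above (hypothetical;
+  // CleanupMainThread would have to be a private static member to reach
+  // pthread_key_):
+  //
+  //   static void CleanupMainThread() {
+  //     void* tls = pthread_getspecific(Instance()->pthread_key_);
+  //     if (tls != nullptr) OnThreadExit(tls);
+  //   }
+  //   // in StaticMeta::StaticMeta():  atexit(&CleanupMainThread);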
+#if !defined(OS_WIN)
+  static struct A {
+    ~A() {
+#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL
+      ThreadData* tls_ =
+        static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+      if (tls_) {
+        OnThreadExit(tls_);
+      }
+    }
+  } a;
+#endif  // !defined(OS_WIN)
+
+  head_.next = &head_;
+  head_.prev = &head_;
+
+#ifdef OS_WIN
+  // Share with Windows its cleanup routine and the key
+  wintlscleanup::thread_local_inclass_routine = OnThreadExit;
+  wintlscleanup::thread_local_key = pthread_key_;
+#endif
+}
+
+void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next = &head_;
+  d->prev = head_.prev;
+  head_.prev->next = d;
+  head_.prev = d;
+}
+
+void ThreadLocalPtr::StaticMeta::RemoveThreadData(
+    ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next->prev = d->prev;
+  d->prev->next = d->next;
+  d->next = d->prev = d;
+}
+
+ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
+#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL
+  // Make this local variable name look like a member variable so that we
+  // can share all the code below
+  ThreadData* tls_ =
+      static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+
+  if (UNLIKELY(tls_ == nullptr)) {
+    auto* inst = Instance();
+    tls_ = new ThreadData(inst);
+    {
+      // Register it in the global chain; this needs to be done before thread
+      // exit handler registration
+      MutexLock l(Mutex());
+      inst->AddThreadData(tls_);
+    }
+    // Even if it is not OS_MACOSX, we need to register a value for
+    // pthread_key_ so that its exit handler will be triggered.
+    if (pthread_setspecific(inst->pthread_key_, tls_) != 0) {
+      {
+        MutexLock l(Mutex());
+        inst->RemoveThreadData(tls_);
+      }
+      delete tls_;
+      abort();
+    }
+  }
+  return tls_;
+}
+
+void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    return nullptr;
+  }
+  return tls->entries[id].ptr.load(std::memory_order_acquire);
+}
+
+void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  tls->entries[id].ptr.store(ptr, std::memory_order_release);
+}
+
+void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire);
+}
+
+bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
+                                                void*& expected) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.compare_exchange_strong(
+      expected, ptr, std::memory_order_release, std::memory_order_relaxed);
+}
+
+void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
+                                        void* const replacement) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr =
+          t->entries[id].ptr.exchange(replacement, std::memory_order_acquire);
+      if (ptr != nullptr) {
+        ptrs->push_back(ptr);
+      }
+    }
+  }
+}
+
+void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) {
+  MutexLock l(Mutex());
+  for 
(ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = t->entries[id].ptr.load(); + if (ptr != nullptr) { + func(ptr, res); + } + } + } +} + +uint32_t ThreadLocalPtr::TEST_PeekId() { + return Instance()->PeekId(); +} + +void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) { + MutexLock l(Mutex()); + handler_map_[id] = handler; +} + +UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) { + Mutex()->AssertHeld(); + auto iter = handler_map_.find(id); + if (iter == handler_map_.end()) { + return nullptr; + } + return iter->second; +} + +uint32_t ThreadLocalPtr::StaticMeta::GetId() { + MutexLock l(Mutex()); + if (free_instance_ids_.empty()) { + return next_instance_id_++; + } + + uint32_t id = free_instance_ids_.back(); + free_instance_ids_.pop_back(); + return id; +} + +uint32_t ThreadLocalPtr::StaticMeta::PeekId() const { + MutexLock l(Mutex()); + if (!free_instance_ids_.empty()) { + return free_instance_ids_.back(); + } + return next_instance_id_; +} + +void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { + // This id is not used, go through all thread local data and release + // corresponding value + MutexLock l(Mutex()); + auto unref = GetHandler(id); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = t->entries[id].ptr.exchange(nullptr); + if (ptr != nullptr && unref != nullptr) { + unref(ptr); + } + } + } + handler_map_[id] = nullptr; + free_instance_ids_.push_back(id); +} + +ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler) + : id_(Instance()->GetId()) { + if (handler != nullptr) { + Instance()->SetHandler(id_, handler); + } +} + +ThreadLocalPtr::~ThreadLocalPtr() { + Instance()->ReclaimId(id_); +} + +void* ThreadLocalPtr::Get() const { + return Instance()->Get(id_); +} + +void ThreadLocalPtr::Reset(void* ptr) { + Instance()->Reset(id_, ptr); +} + +void* ThreadLocalPtr::Swap(void* ptr) { + return Instance()->Swap(id_, ptr); +} + +bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) { + return Instance()->CompareAndSwap(id_, ptr, expected); +} + +void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) { + Instance()->Scrape(id_, ptrs, replacement); +} + +void ThreadLocalPtr::Fold(FoldFunc func, void* res) { + Instance()->Fold(id_, func, res); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/thread_local.h b/src/rocksdb/util/thread_local.h new file mode 100644 index 00000000..5dad7292 --- /dev/null +++ b/src/rocksdb/util/thread_local.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <atomic> +#include <functional> +#include <memory> +#include <unordered_map> +#include <vector> + +#include "util/autovector.h" +#include "port/port.h" + +namespace rocksdb { + +// Cleanup function that will be called for a stored thread local +// pointer (if not NULL) when one of the following happens: +// (1) a thread terminates +// (2) a ThreadLocalPtr is destroyed +// +// Warning: this function is called while holding a global mutex. 
The same mutex
+// is used (at least in some cases) by most methods of ThreadLocalPtr, and it's
+// shared across all instances of ThreadLocalPtr. Therefore extra care
+// is needed to avoid deadlocks. In particular, the handler shouldn't lock any
+// mutexes and shouldn't call any methods of any ThreadLocalPtr instances,
+// unless you know what you're doing.
+typedef void (*UnrefHandler)(void* ptr);
+
+// ThreadLocalPtr stores only values of pointer type. Unlike the usual
+// thread-local storage, ThreadLocalPtr has the ability to distinguish data
+// coming from different threads and different ThreadLocalPtr instances.
+// For example, if a regular thread_local variable A is declared in DBImpl,
+// two DBImpl objects would share the same A. A ThreadLocalPtr that is
+// defined under the scope of DBImpl, however, avoids such a conflict.
+// As a result, its memory usage would be O(# of threads * # of
+// ThreadLocalPtr instances).
+class ThreadLocalPtr {
+ public:
+  explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
+
+  ThreadLocalPtr(const ThreadLocalPtr&) = delete;
+  ThreadLocalPtr& operator=(const ThreadLocalPtr&) = delete;
+
+  ~ThreadLocalPtr();
+
+  // Return the current pointer stored in thread local
+  void* Get() const;
+
+  // Set a new pointer value to the thread local storage.
+  void Reset(void* ptr);
+
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(void* ptr);
+
+  // Atomically compare the stored value with expected. Set the new
+  // pointer value to thread local only if the comparison is true.
+  // Otherwise, the stored value is returned through expected.
+  // Return true on success, false on failure
+  bool CompareAndSwap(void* ptr, void*& expected);
+
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(autovector<void*>* ptrs, void* const replacement);
+
+  typedef std::function<void(void*, void*)> FoldFunc;
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
+  void Fold(FoldFunc func, void* res);
+
+  // For testing only:
+  // Return the next available Id without claiming it
+  static uint32_t TEST_PeekId();
+
+  // Initialize the static singletons of the ThreadLocalPtr.
+  //
+  // If this function is not called, then the singletons will be
+  // automatically initialized when they are used.
+  //
+  // Calling this function twice or after the singletons have been
+  // initialized is a no-op.
+  static void InitSingletons();
+
+  class StaticMeta;
+
+private:
+
+  static StaticMeta* Instance();
+
+  const uint32_t id_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_local_test.cc b/src/rocksdb/util/thread_local_test.cc
new file mode 100644
index 00000000..789be83d
--- /dev/null
+++ b/src/rocksdb/util/thread_local_test.cc
@@ -0,0 +1,582 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
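+// A quick sketch of the API exercised below: each ThreadLocalPtr stores one
+// void* per thread, and an optional UnrefHandler runs for every non-null
+// stored pointer on thread exit or instance destruction. For instance:
+//
+//   static void Unref(void* p) { delete static_cast<int*>(p); }
+//   ThreadLocalPtr tls(&Unref);
+//   tls.Reset(new int(7));                  // sets this thread's slot
+//   int v = *static_cast<int*>(tls.Get());  // 7; other threads still get null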
+ +#include <thread> +#include <atomic> +#include <string> + +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/autovector.h" +#include "util/sync_point.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "util/thread_local.h" + +namespace rocksdb { + +class ThreadLocalTest : public testing::Test { + public: + ThreadLocalTest() : env_(Env::Default()) {} + + Env* env_; +}; + +namespace { + +struct Params { + Params(port::Mutex* m, port::CondVar* c, int* u, int n, + UnrefHandler handler = nullptr) + : mu(m), + cv(c), + unref(u), + total(n), + started(0), + completed(0), + doWrite(false), + tls1(handler), + tls2(nullptr) {} + + port::Mutex* mu; + port::CondVar* cv; + int* unref; + int total; + int started; + int completed; + bool doWrite; + ThreadLocalPtr tls1; + ThreadLocalPtr* tls2; +}; + +class IDChecker : public ThreadLocalPtr { +public: + static uint32_t PeekId() { + return TEST_PeekId(); + } +}; + +} // anonymous namespace + +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ +TEST_F(ThreadLocalTest, UniqueIdTest) { + port::Mutex mu; + port::CondVar cv(&mu); + + ASSERT_EQ(IDChecker::PeekId(), 0u); + // New ThreadLocal instance bumps id by 1 + { + // Id used 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Id used 1 + Params p2(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // Id used 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // Id used 3 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 4u); + } + // id 3, 2, 1, 0 are in the free queue in order + ASSERT_EQ(IDChecker::PeekId(), 0u); + + // pick up 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // pick up 1 + Params* p2 = new Params(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // pick up 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // return up 1 + delete p2; + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Now we have 3, 1 in queue + // pick up 1 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // pick up 3 + Params p5(&mu, &cv, nullptr, 1u); + // next new id + ASSERT_EQ(IDChecker::PeekId(), 4u); + // After exit, id sequence in queue: + // 3, 1, 2, 0 +} +#endif // __clang_analyzer__ + +TEST_F(ThreadLocalTest, SequentialReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + ASSERT_EQ(IDChecker::PeekId(), 0u); + + port::Mutex mu; + port::CondVar cv(&mu); + Params p(&mu, &cv, nullptr, 1); + ThreadLocalPtr tls2; + p.tls2 = &tls2; + + auto func = [](void* ptr) { + auto& params = *static_cast<Params*>(ptr); + + ASSERT_TRUE(params.tls1.Get() == nullptr); + params.tls1.Reset(reinterpret_cast<int*>(1)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(1)); + params.tls1.Reset(reinterpret_cast<int*>(2)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(2)); + + ASSERT_TRUE(params.tls2->Get() == nullptr); + params.tls2->Reset(reinterpret_cast<int*>(1)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(1)); + params.tls2->Reset(reinterpret_cast<int*>(2)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(2)); + + params.mu->Lock(); + ++(params.completed); + params.cv->SignalAll(); + params.mu->Unlock(); + }; + + for (int iter = 0; iter < 1024; ++iter) { + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Another new thread, read/write should not see value from previous thread + env_->StartThread(func, static_cast<void*>(&p)); + mu.Lock(); + while 
(p.completed != iter + 1) {
+      cv.Wait();
+    }
+    mu.Unlock();
+    ASSERT_EQ(IDChecker::PeekId(), 1u);
+  }
+}
+
+TEST_F(ThreadLocalTest, ConcurrentReadWriteTest) {
+  // global id list carries over 3, 1, 2, 0
+  ASSERT_EQ(IDChecker::PeekId(), 0u);
+
+  ThreadLocalPtr tls2;
+  port::Mutex mu1;
+  port::CondVar cv1(&mu1);
+  Params p1(&mu1, &cv1, nullptr, 16);
+  p1.tls2 = &tls2;
+
+  port::Mutex mu2;
+  port::CondVar cv2(&mu2);
+  Params p2(&mu2, &cv2, nullptr, 16);
+  p2.doWrite = true;
+  p2.tls2 = &tls2;
+
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    p.mu->Lock();
+    // size_t matches the size of the pointer type we want to cast to.
+    size_t own = ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    // Let write threads write a different value from the read threads
+    if (p.doWrite) {
+      own += 8192;
+    }
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    auto* env = Env::Default();
+    auto start = env->NowMicros();
+
+    p.tls1.Reset(reinterpret_cast<size_t*>(own));
+    p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+    // Loop for 1 second
+    while (env->NowMicros() - start < 1000 * 1000) {
+      for (int iter = 0; iter < 100000; ++iter) {
+        ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<size_t*>(own));
+        ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<size_t*>(own + 1));
+        if (p.doWrite) {
+          p.tls1.Reset(reinterpret_cast<size_t*>(own));
+          p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+        }
+      }
+    }
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+    p.mu->Unlock();
+  };
+
+  // Initiate 2 instances: one keeps writing and one keeps reading.
+  // The read instance should not see data from the write instance.
+  // Each thread-local copy of the value is also different from the
+  // others.
+ for (int th = 0; th < p1.total; ++th) { + env_->StartThread(func, static_cast<void*>(&p1)); + } + for (int th = 0; th < p2.total; ++th) { + env_->StartThread(func, static_cast<void*>(&p2)); + } + + mu1.Lock(); + while (p1.completed != p1.total) { + cv1.Wait(); + } + mu1.Unlock(); + + mu2.Lock(); + while (p2.completed != p2.total) { + cv2.Wait(); + } + mu2.Unlock(); + + ASSERT_EQ(IDChecker::PeekId(), 3u); +} + +TEST_F(ThreadLocalTest, Unref) { + ASSERT_EQ(IDChecker::PeekId(), 0u); + + auto unref = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + // Case 0: no unref triggered if ThreadLocalPtr is never accessed + auto func0 = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func0, static_cast<void*>(&p)); + } + env_->WaitForJoin(); + ASSERT_EQ(unref_count, 0); + } + + // Case 1: unref triggered by thread exit + auto func1 = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + ThreadLocalPtr tls2(unref); + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = &tls2; + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func1, static_cast<void*>(&p)); + } + + env_->WaitForJoin(); + + // N threads x 2 ThreadLocal instance cleanup on thread exit + ASSERT_EQ(unref_count, 2 * p.total); + } + + // Case 2: unref triggered by ThreadLocal instance destruction + auto func2 = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func2, static_cast<void*>(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + // Now destroy one ThreadLocal instance + delete p.tls2; + p.tls2 = nullptr; + // instance destroy for N threads + ASSERT_EQ(unref_count, p.total); + + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + // additional N threads exit unref for the left instance + ASSERT_EQ(unref_count, 2 * p.total); + } +} + +TEST_F(ThreadLocalTest, Swap) { + ThreadLocalPtr tls; + tls.Reset(reinterpret_cast<void*>(1)); 
+ ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(nullptr)), 1); + ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(2)) == nullptr); + ASSERT_EQ(reinterpret_cast<int64_t>(tls.Get()), 2); + ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(reinterpret_cast<void*>(3))), 2); +} + +TEST_F(ThreadLocalTest, Scrape) { + auto unref = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + auto func = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func, static_cast<void*>(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + ASSERT_EQ(unref_count, 0); + + // Scrape all thread local data. No unref at thread + // exit or ThreadLocalPtr destruction + autovector<void*> ptrs; + p.tls1.Scrape(&ptrs, nullptr); + p.tls2->Scrape(&ptrs, nullptr); + delete p.tls2; + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + + ASSERT_EQ(unref_count, 0); + } +} + +TEST_F(ThreadLocalTest, Fold) { + auto unref = [](void* ptr) { + delete static_cast<std::atomic<int64_t>*>(ptr); + }; + static const int kNumThreads = 16; + static const int kItersPerThread = 10; + port::Mutex mu; + port::CondVar cv(&mu); + Params params(&mu, &cv, nullptr, kNumThreads, unref); + auto func = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + ASSERT_TRUE(p.tls1.Get() == nullptr); + p.tls1.Reset(new std::atomic<int64_t>(0)); + + for (int i = 0; i < kItersPerThread; ++i) { + static_cast<std::atomic<int64_t>*>(p.tls1.Get())->fetch_add(1); + } + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 0; th < params.total; ++th) { + env_->StartThread(func, static_cast<void*>(¶ms)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (params.completed != params.total) { + cv.Wait(); + } + mu.Unlock(); + + // Verify Fold() behavior + int64_t sum = 0; + params.tls1.Fold( + [](void* ptr, void* res) { + auto sum_ptr = static_cast<int64_t*>(res); + *sum_ptr += static_cast<std::atomic<int64_t>*>(ptr)->load(); + }, + &sum); + ASSERT_EQ(sum, kNumThreads * kItersPerThread); + + // Signal to exit + mu.Lock(); + params.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); +} + +TEST_F(ThreadLocalTest, CompareAndSwap) { + ThreadLocalPtr tls; + ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(1)) == nullptr); + void* expected = reinterpret_cast<void*>(1); + // Swap in 2 + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(2), expected)); + expected = reinterpret_cast<void*>(100); + // Fail Swap, still 2 + ASSERT_TRUE(!tls.CompareAndSwap(reinterpret_cast<void*>(2), expected)); + ASSERT_EQ(expected, reinterpret_cast<void*>(2)); + // Swap in 3 + expected = reinterpret_cast<void*>(2); + 
ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(3), expected));
+  ASSERT_EQ(tls.Get(), reinterpret_cast<void*>(3));
+}
+
+namespace {
+
+void* AccessThreadLocal(void* /*arg*/) {
+  TEST_SYNC_POINT("AccessThreadLocal:Start");
+  ThreadLocalPtr tlp;
+  tlp.Reset(new std::string("hello RocksDB"));
+  TEST_SYNC_POINT("AccessThreadLocal:End");
+  return nullptr;
+}
+
+}  // namespace
+
+// The following test is disabled as it requires manual steps to run it
+// correctly.
+//
+// Currently we have no way to access SyncPoint w/o an ASAN error when the
+// child thread dies after the main thread dies. So if you manually enable
+// this test and only see an ASAN error on SyncPoint, it means you passed
+// the test.
+TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"AccessThreadLocal:Start", "MainThreadDiesFirst:End"},
+       {"PosixEnv::~PosixEnv():End", "AccessThreadLocal:End"}});
+
+  // Triggers the initialization of singletons.
+  Env::Default();
+
+#ifndef ROCKSDB_LITE
+  try {
+#endif  // ROCKSDB_LITE
+    rocksdb::port::Thread th(&AccessThreadLocal, nullptr);
+    th.detach();
+    TEST_SYNC_POINT("MainThreadDiesFirst:End");
+#ifndef ROCKSDB_LITE
+  } catch (const std::system_error& ex) {
+    std::cerr << "Start thread: " << ex.code() << std::endl;
+    FAIL();
+  }
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/thread_operation.h b/src/rocksdb/util/thread_operation.h
new file mode 100644
index 00000000..f1827da0
--- /dev/null
+++ b/src/rocksdb/util/thread_operation.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for thread operation and state.
+// Thread operations are used to describe the high-level action of a
+// thread, such as doing compaction or flush, while thread states
+// are used to describe lower-level actions such as reading /
+// writing a file or waiting for a mutex. Operations and states
+// are designed to be independent. Typically, a thread is involved
+// in one operation and one state at any specific point in time.
+
+#pragma once
+
+#include "rocksdb/thread_status.h"
+
+#include <string>
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+// The structure that describes a major thread operation.
+struct OperationInfo {
+  const ThreadStatus::OperationType type;
+  const std::string name;
+};
+
+// The global operation table.
+//
+// When updating a status of a thread, the pointer of the OperationInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+//
+// Note that it's not designed to be constant as in the future we
+// might consider adding a global count to the OperationInfo.
+static OperationInfo global_operation_table[] = {
+  {ThreadStatus::OP_UNKNOWN, ""},
+  {ThreadStatus::OP_COMPACTION, "Compaction"},
+  {ThreadStatus::OP_FLUSH, "Flush"}
+};
+
+struct OperationStageInfo {
+  const ThreadStatus::OperationStage stage;
+  const std::string name;
+};
+
+// A table that maintains the mapping from stage type to stage string.
+// Note that the string must be changed accordingly when the
+// associated function name changes.
+static OperationStageInfo global_op_stage_table[] = { + {ThreadStatus::STAGE_UNKNOWN, ""}, + {ThreadStatus::STAGE_FLUSH_RUN, + "FlushJob::Run"}, + {ThreadStatus::STAGE_FLUSH_WRITE_L0, + "FlushJob::WriteLevel0Table"}, + {ThreadStatus::STAGE_COMPACTION_PREPARE, + "CompactionJob::Prepare"}, + {ThreadStatus::STAGE_COMPACTION_RUN, + "CompactionJob::Run"}, + {ThreadStatus::STAGE_COMPACTION_PROCESS_KV, + "CompactionJob::ProcessKeyValueCompaction"}, + {ThreadStatus::STAGE_COMPACTION_INSTALL, + "CompactionJob::Install"}, + {ThreadStatus::STAGE_COMPACTION_SYNC_FILE, + "CompactionJob::FinishCompactionOutputFile"}, + {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH, + "MemTableList::PickMemtablesToFlush"}, + {ThreadStatus::STAGE_MEMTABLE_ROLLBACK, + "MemTableList::RollbackMemtableFlush"}, + {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + "MemTableList::TryInstallMemtableFlushResults"}, +}; + +// The structure that describes a state. +struct StateInfo { + const ThreadStatus::StateType type; + const std::string name; +}; + +// The global state table. +// +// When updating a status of a thread, the pointer of the StateInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +static StateInfo global_state_table[] = { + {ThreadStatus::STATE_UNKNOWN, ""}, + {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, +}; + +struct OperationProperty { + int code; + std::string name; +}; + +static OperationProperty compaction_operation_properties[] = { + {ThreadStatus::COMPACTION_JOB_ID, "JobID"}, + {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"}, + {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"}, + {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"}, + {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"}, + {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"}, +}; + +static OperationProperty flush_operation_properties[] = { + {ThreadStatus::FLUSH_JOB_ID, "JobID"}, + {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"}, + {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"} +}; + +#else + +struct OperationInfo { +}; + +struct StateInfo { +}; + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/src/rocksdb/util/threadpool_imp.cc b/src/rocksdb/util/threadpool_imp.cc new file mode 100644 index 00000000..acac0063 --- /dev/null +++ b/src/rocksdb/util/threadpool_imp.cc @@ -0,0 +1,508 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
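Aside on the tables above: the rows of global_operation_table,
global_op_stage_table, and global_state_table appear to be laid out in the
same order as the corresponding enum values, so a consumer can index them
directly. A minimal lookup sketch (illustrative only, not part of
thread_operation.h; it assumes the enum-order layout holds):

  inline const std::string& OperationStageName(
      ThreadStatus::OperationStage stage) {
    // Guard the enum-order assumption before indexing.
    assert(global_op_stage_table[static_cast<size_t>(stage)].stage == stage);
    return global_op_stage_table[static_cast<size_t>(stage)].name;
  }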
+ +#include "util/threadpool_imp.h" + +#include "monitoring/thread_status_util.h" +#include "port/port.h" + +#ifndef OS_WIN +# include <unistd.h> +#endif + +#ifdef OS_LINUX +# include <sys/syscall.h> +# include <sys/resource.h> +#endif + +#include <stdlib.h> +#include <algorithm> +#include <atomic> +#include <condition_variable> +#include <mutex> +#include <sstream> +#include <thread> +#include <vector> + +namespace rocksdb { + +void ThreadPoolImpl::PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +struct ThreadPoolImpl::Impl { + + Impl(); + ~Impl(); + + void JoinThreads(bool wait_for_jobs_to_complete); + + void SetBackgroundThreadsInternal(int num, bool allow_reduce); + int GetBackgroundThreads(); + + unsigned int GetQueueLen() const { + return queue_len_.load(std::memory_order_relaxed); + } + + void LowerIOPriority(); + + void LowerCPUPriority(); + + void WakeUpAllThreads() { + bgsignal_.notify_all(); + } + + void BGThread(size_t thread_id); + + void StartBGThreads(); + + void Submit(std::function<void()>&& schedule, + std::function<void()>&& unschedule, void* tag); + + int UnSchedule(void* arg); + + void SetHostEnv(Env* env) { env_ = env; } + + Env* GetHostEnv() const { return env_; } + + bool HasExcessiveThread() const { + return static_cast<int>(bgthreads_.size()) > total_threads_limit_; + } + + // Return true iff the current thread is the excessive thread to terminate. + // Always terminate the running thread that is added last, even if there are + // more than one thread to terminate. + bool IsLastExcessiveThread(size_t thread_id) const { + return HasExcessiveThread() && thread_id == bgthreads_.size() - 1; + } + + bool IsExcessiveThread(size_t thread_id) const { + return static_cast<int>(thread_id) >= total_threads_limit_; + } + + // Return the thread priority. + // This would allow its member-thread to know its priority. + Env::Priority GetThreadPriority() const { return priority_; } + + // Set the thread priority. + void SetThreadPriority(Env::Priority priority) { priority_ = priority; } + +private: + + static void* BGThreadWrapper(void* arg); + + bool low_io_priority_; + bool low_cpu_priority_; + Env::Priority priority_; + Env* env_; + + int total_threads_limit_; + std::atomic_uint queue_len_; // Queue length. Used for stats reporting + bool exit_all_threads_; + bool wait_for_jobs_to_complete_; + + // Entry per Schedule()/Submit() call + struct BGItem { + void* tag = nullptr; + std::function<void()> function; + std::function<void()> unschedFunction; + }; + + using BGQueue = std::deque<BGItem>; + BGQueue queue_; + + std::mutex mu_; + std::condition_variable bgsignal_; + std::vector<port::Thread> bgthreads_; +}; + + +inline +ThreadPoolImpl::Impl::Impl() + : + low_io_priority_(false), + low_cpu_priority_(false), + priority_(Env::LOW), + env_(nullptr), + total_threads_limit_(0), + queue_len_(), + exit_all_threads_(false), + wait_for_jobs_to_complete_(false), + queue_(), + mu_(), + bgsignal_(), + bgthreads_() { +} + +inline +ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); } + +void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) { + + std::unique_lock<std::mutex> lock(mu_); + assert(!exit_all_threads_); + + wait_for_jobs_to_complete_ = wait_for_jobs_to_complete; + exit_all_threads_ = true; + // prevent threads from being recreated right after they're joined, in case + // the user is concurrently submitting jobs. 
+  total_threads_limit_ = 0;
+
+  lock.unlock();
+
+  bgsignal_.notify_all();
+
+  for (auto& th : bgthreads_) {
+    th.join();
+  }
+
+  bgthreads_.clear();
+
+  exit_all_threads_ = false;
+  wait_for_jobs_to_complete_ = false;
+}
+
+inline
+void ThreadPoolImpl::Impl::LowerIOPriority() {
+  std::lock_guard<std::mutex> lock(mu_);
+  low_io_priority_ = true;
+}
+
+inline
+void ThreadPoolImpl::Impl::LowerCPUPriority() {
+  std::lock_guard<std::mutex> lock(mu_);
+  low_cpu_priority_ = true;
+}
+
+void ThreadPoolImpl::Impl::BGThread(size_t thread_id) {
+  bool low_io_priority = false;
+  bool low_cpu_priority = false;
+
+  while (true) {
+    // Wait until there is an item that is ready to run
+    std::unique_lock<std::mutex> lock(mu_);
+    // Stop waiting if the thread needs to do work or needs to terminate.
+    while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+           (queue_.empty() || IsExcessiveThread(thread_id))) {
+      bgsignal_.wait(lock);
+    }
+
+    if (exit_all_threads_) {  // mechanism to let BG threads exit safely
+
+      if (!wait_for_jobs_to_complete_ ||
+          queue_.empty()) {
+        break;
+      }
+    }
+
+    if (IsLastExcessiveThread(thread_id)) {
+      // Current thread is the last generated one and is excessive.
+      // We always terminate excessive threads in the reverse order of
+      // generation time.
+      auto& terminating_thread = bgthreads_.back();
+      terminating_thread.detach();
+      bgthreads_.pop_back();
+
+      if (HasExcessiveThread()) {
+        // There is still at least one more excessive thread to terminate.
+        WakeUpAllThreads();
+      }
+      break;
+    }
+
+    auto func = std::move(queue_.front().function);
+    queue_.pop_front();
+
+    queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                     std::memory_order_relaxed);
+
+    bool decrease_io_priority = (low_io_priority != low_io_priority_);
+    bool decrease_cpu_priority = (low_cpu_priority != low_cpu_priority_);
+    lock.unlock();
+
+#ifdef OS_LINUX
+    if (decrease_cpu_priority) {
+      setpriority(
+          PRIO_PROCESS,
+          // Current thread.
+          0,
+          // Lowest priority possible.
+          19);
+      low_cpu_priority = true;
+    }
+
+    if (decrease_io_priority) {
+#define IOPRIO_CLASS_SHIFT (13)
+#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data)
+      // Put this thread into the IOPRIO_CLASS_IDLE class (lowest).
+      // These system calls only have an effect when used in conjunction
+      // with an I/O scheduler that supports I/O priorities. As at
+      // kernel 2.6.17 the only such scheduler is the Completely
+      // Fair Queuing (CFQ) I/O scheduler.
+      // To change the scheduler:
+      //   echo cfq > /sys/block/<device_name>/queue/scheduler
+      // Tunables to consider:
+      //   /sys/block/<device_name>/queue/slice_idle
+      //   /sys/block/<device_name>/queue/slice_sync
+      syscall(SYS_ioprio_set, 1,  // IOPRIO_WHO_PROCESS
+              0,                  // current thread
+              IOPRIO_PRIO_VALUE(3, 0));
+      low_io_priority = true;
+    }
+#else
+    (void)decrease_io_priority;  // avoid 'unused variable' error
+    (void)decrease_cpu_priority;
+#endif
+    func();
+  }
+}
+
+// Helper struct for passing arguments when creating threads.
+struct BGThreadMetadata {
+  ThreadPoolImpl::Impl* thread_pool_;
+  size_t thread_id_;  // Ordinal of the thread within the pool.
+ BGThreadMetadata(ThreadPoolImpl::Impl* thread_pool, size_t thread_id) + : thread_pool_(thread_pool), thread_id_(thread_id) {} +}; + +void* ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) { + BGThreadMetadata* meta = reinterpret_cast<BGThreadMetadata*>(arg); + size_t thread_id = meta->thread_id_; + ThreadPoolImpl::Impl* tp = meta->thread_pool_; +#ifdef ROCKSDB_USING_THREAD_STATUS + // initialize it because compiler isn't good enough to see we don't use it + // uninitialized + ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES; + switch (tp->GetThreadPriority()) { + case Env::Priority::HIGH: + thread_type = ThreadStatus::HIGH_PRIORITY; + break; + case Env::Priority::LOW: + thread_type = ThreadStatus::LOW_PRIORITY; + break; + case Env::Priority::BOTTOM: + thread_type = ThreadStatus::BOTTOM_PRIORITY; + break; + case Env::Priority::USER: + thread_type = ThreadStatus::USER; + break; + case Env::Priority::TOTAL: + assert(false); + return nullptr; + } + assert(thread_type != ThreadStatus::NUM_THREAD_TYPES); + ThreadStatusUtil::RegisterThread(tp->GetHostEnv(), thread_type); +#endif + delete meta; + tp->BGThread(thread_id); +#ifdef ROCKSDB_USING_THREAD_STATUS + ThreadStatusUtil::UnregisterThread(); +#endif + return nullptr; +} + +void ThreadPoolImpl::Impl::SetBackgroundThreadsInternal(int num, + bool allow_reduce) { + std::unique_lock<std::mutex> lock(mu_); + if (exit_all_threads_) { + lock.unlock(); + return; + } + if (num > total_threads_limit_ || + (num < total_threads_limit_ && allow_reduce)) { + total_threads_limit_ = std::max(0, num); + WakeUpAllThreads(); + StartBGThreads(); + } +} + +int ThreadPoolImpl::Impl::GetBackgroundThreads() { + std::unique_lock<std::mutex> lock(mu_); + return total_threads_limit_; +} + +void ThreadPoolImpl::Impl::StartBGThreads() { + // Start background thread if necessary + while ((int)bgthreads_.size() < total_threads_limit_) { + + port::Thread p_t(&BGThreadWrapper, + new BGThreadMetadata(this, bgthreads_.size())); + +// Set the thread name to aid debugging +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + auto th_handle = p_t.native_handle(); + std::string thread_priority = Env::PriorityToString(GetThreadPriority()); + std::ostringstream thread_name_stream; + thread_name_stream << "rocksdb:"; + for (char c : thread_priority) { + thread_name_stream << static_cast<char>(tolower(c)); + } + thread_name_stream << bgthreads_.size(); + pthread_setname_np(th_handle, thread_name_stream.str().c_str()); +#endif +#endif + bgthreads_.push_back(std::move(p_t)); + } +} + +void ThreadPoolImpl::Impl::Submit(std::function<void()>&& schedule, + std::function<void()>&& unschedule, void* tag) { + + std::lock_guard<std::mutex> lock(mu_); + + if (exit_all_threads_) { + return; + } + + StartBGThreads(); + + // Add to priority queue + queue_.push_back(BGItem()); + + auto& item = queue_.back(); + item.tag = tag; + item.function = std::move(schedule); + item.unschedFunction = std::move(unschedule); + + queue_len_.store(static_cast<unsigned int>(queue_.size()), + std::memory_order_relaxed); + + if (!HasExcessiveThread()) { + // Wake up at least one waiting thread. + bgsignal_.notify_one(); + } else { + // Need to wake up all threads to make sure the one woken + // up is not the one to terminate. 
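+      // notify_one() might wake the thread that is about to terminate as
+      // excessive; broadcasting guarantees a surviving thread also sees the
+      // new job.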
+ WakeUpAllThreads(); + } +} + +int ThreadPoolImpl::Impl::UnSchedule(void* arg) { + int count = 0; + + std::vector<std::function<void()>> candidates; + { + std::lock_guard<std::mutex> lock(mu_); + + // Remove from priority queue + BGQueue::iterator it = queue_.begin(); + while (it != queue_.end()) { + if (arg == (*it).tag) { + if (it->unschedFunction) { + candidates.push_back(std::move(it->unschedFunction)); + } + it = queue_.erase(it); + count++; + } else { + ++it; + } + } + queue_len_.store(static_cast<unsigned int>(queue_.size()), + std::memory_order_relaxed); + } + + + // Run unschedule functions outside the mutex + for (auto& f : candidates) { + f(); + } + + return count; +} + +ThreadPoolImpl::ThreadPoolImpl() : + impl_(new Impl()) { +} + + +ThreadPoolImpl::~ThreadPoolImpl() { +} + +void ThreadPoolImpl::JoinAllThreads() { + impl_->JoinThreads(false); +} + +void ThreadPoolImpl::SetBackgroundThreads(int num) { + impl_->SetBackgroundThreadsInternal(num, true); +} + +int ThreadPoolImpl::GetBackgroundThreads() { + return impl_->GetBackgroundThreads(); +} + +unsigned int ThreadPoolImpl::GetQueueLen() const { + return impl_->GetQueueLen(); +} + +void ThreadPoolImpl::WaitForJobsAndJoinAllThreads() { + impl_->JoinThreads(true); +} + +void ThreadPoolImpl::LowerIOPriority() { + impl_->LowerIOPriority(); +} + +void ThreadPoolImpl::LowerCPUPriority() { + impl_->LowerCPUPriority(); +} + +void ThreadPoolImpl::IncBackgroundThreadsIfNeeded(int num) { + impl_->SetBackgroundThreadsInternal(num, false); +} + +void ThreadPoolImpl::SubmitJob(const std::function<void()>& job) { + auto copy(job); + impl_->Submit(std::move(copy), std::function<void()>(), nullptr); +} + + +void ThreadPoolImpl::SubmitJob(std::function<void()>&& job) { + impl_->Submit(std::move(job), std::function<void()>(), nullptr); +} + +void ThreadPoolImpl::Schedule(void(*function)(void* arg1), void* arg, + void* tag, void(*unschedFunction)(void* arg)) { + if (unschedFunction == nullptr) { + impl_->Submit(std::bind(function, arg), std::function<void()>(), tag); + } else { + impl_->Submit(std::bind(function, arg), std::bind(unschedFunction, arg), + tag); + } +} + +int ThreadPoolImpl::UnSchedule(void* arg) { + return impl_->UnSchedule(arg); +} + +void ThreadPoolImpl::SetHostEnv(Env* env) { impl_->SetHostEnv(env); } + +Env* ThreadPoolImpl::GetHostEnv() const { return impl_->GetHostEnv(); } + +// Return the thread priority. +// This would allow its member-thread to know its priority. +Env::Priority ThreadPoolImpl::GetThreadPriority() const { + return impl_->GetThreadPriority(); +} + +// Set the thread priority. +void ThreadPoolImpl::SetThreadPriority(Env::Priority priority) { + impl_->SetThreadPriority(priority); +} + +ThreadPool* NewThreadPool(int num_threads) { + ThreadPoolImpl* thread_pool = new ThreadPoolImpl(); + thread_pool->SetBackgroundThreads(num_threads); + return thread_pool; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/threadpool_imp.h b/src/rocksdb/util/threadpool_imp.h new file mode 100644 index 00000000..3cdafb83 --- /dev/null +++ b/src/rocksdb/util/threadpool_imp.h @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "rocksdb/threadpool.h"
+#include "rocksdb/env.h"
+
+#include <memory>
+#include <functional>
+
+namespace rocksdb {
+
+
+class ThreadPoolImpl : public ThreadPool {
+ public:
+  ThreadPoolImpl();
+  ~ThreadPoolImpl();
+
+  ThreadPoolImpl(ThreadPoolImpl&&) = delete;
+  ThreadPoolImpl& operator=(ThreadPoolImpl&&) = delete;
+
+  // Implement ThreadPool interfaces
+
+  // Wait for all threads to finish.
+  // Discards all the jobs that did not
+  // start executing and waits for those running
+  // to complete
+  void JoinAllThreads() override;
+
+  // Set the number of background threads that will be executing the
+  // scheduled jobs.
+  void SetBackgroundThreads(int num) override;
+  int GetBackgroundThreads() override;
+
+  // Get the number of jobs scheduled in the ThreadPool queue.
+  unsigned int GetQueueLen() const override;
+
+  // Waits for all jobs to complete: those
+  // that already started running and those that did not
+  // start yet
+  void WaitForJobsAndJoinAllThreads() override;
+
+  // Make threads run at a lower kernel IO priority
+  // Currently only has effect on Linux
+  void LowerIOPriority();
+
+  // Make threads run at a lower kernel CPU priority
+  // Currently only has effect on Linux
+  void LowerCPUPriority();
+
+  // Ensure there are at least num threads in the pool
+  // but do not kill threads if there are more
+  void IncBackgroundThreadsIfNeeded(int num);
+
+  // Submit a fire and forget job
+  // These jobs can not be unscheduled
+
+  // This allows submitting the same job multiple times
+  void SubmitJob(const std::function<void()>&) override;
+  // This moves the function in for efficiency
+  void SubmitJob(std::function<void()>&&) override;
+
+  // Schedule a job with an unschedule tag and unschedule function
+  // Can be used to filter and unschedule jobs by a tag
+  // that are still in the queue and did not start running
+  void Schedule(void (*function)(void* arg1), void* arg, void* tag,
+                void (*unschedFunction)(void* arg));
+
+  // Filter jobs that are still in the queue and match
+  // the given tag. Remove them from the queue if any,
+  // and for each such job execute the unschedule function
+  // if one was given at scheduling time.
+  int UnSchedule(void* tag);
+
+  void SetHostEnv(Env* env);
+
+  Env* GetHostEnv() const;
+
+  // Return the thread priority.
+  // This would allow its member-thread to know its priority.
+  Env::Priority GetThreadPriority() const;
+
+  // Set the thread priority.
+  void SetThreadPriority(Env::Priority priority);
+
+  static void PthreadCall(const char* label, int result);
+
+  struct Impl;
+
+ private:
+
+  // The current public virtual interface does not provide usable
+  // functionality and thus can not be used internally as a facade
+  // for different implementations.
+  //
+  // We propose a pimpl idiom in order to easily replace the thread pool impl
+  // w/o touching the header file, by providing a different .cc, potentially
+  // driven by a CMake option.
+  //
+  // Another option is to introduce an Env::MakeThreadPool() virtual interface
+  // and override the environment. This would require refactoring ThreadPool
+  // usage.
+  //
+  // We can also combine these two approaches
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/timer_queue.h b/src/rocksdb/util/timer_queue.h
new file mode 100644
index 00000000..bd8a4f85
--- /dev/null
+++ b/src/rocksdb/util/timer_queue.h
@@ -0,0 +1,230 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
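Stepping back to the thread pool interface just shown, a hedged usage sketch
(not code from this tree; it relies only on the public rocksdb/threadpool.h
declarations and the NewThreadPool() factory defined in threadpool_imp.cc):

  #include "rocksdb/threadpool.h"
  #include <memory>

  void Example() {
    std::unique_ptr<rocksdb::ThreadPool> pool(rocksdb::NewThreadPool(4));
    pool->SubmitJob([] { /* background work */ });
    // Drain the queue, then join the worker threads.
    pool->WaitForJobsAndJoinAllThreads();
  }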
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +// Timer Queue +// +// License +// +// The source code in this article is licensed under the CC0 license, so feel +// free to copy, modify, share, do whatever you want with it. +// No attribution is required, but Ill be happy if you do. +// CC0 license + +// The person who associated a work with this deed has dedicated the work to the +// public domain by waiving all of his or her rights to the work worldwide +// under copyright law, including all related and neighboring rights, to the +// extent allowed by law. You can copy, modify, distribute and perform the +// work, even for commercial purposes, all without asking permission. + +#pragma once + +#include <assert.h> +#include <chrono> +#include <condition_variable> +#include <functional> +#include <queue> +#include <thread> +#include <utility> +#include <vector> + +#include "port/port.h" +#include "util/sync_point.h" + +// Allows execution of handlers at a specified time in the future +// Guarantees: +// - All handlers are executed ONCE, even if cancelled (aborted parameter will +// be set to true) +// - If TimerQueue is destroyed, it will cancel all handlers. +// - Handlers are ALWAYS executed in the Timer Queue worker thread. +// - Handlers execution order is NOT guaranteed +// +//////////////////////////////////////////////////////////////////////////////// +// borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +class TimerQueue { + public: + TimerQueue() : m_th(&TimerQueue::run, this) {} + + ~TimerQueue() { shutdown(); } + + // This function is not thread-safe. + void shutdown() { + if (closed_) { + return; + } + cancelAll(); + // Abusing the timer queue to trigger the shutdown. + add(0, [this](bool) { + m_finish = true; + return std::make_pair(false, 0); + }); + m_th.join(); + closed_ = true; + } + + // Adds a new timer + // \return + // Returns the ID of the new timer. You can use this ID to cancel the + // timer + uint64_t add(int64_t milliseconds, + std::function<std::pair<bool, int64_t>(bool)> handler) { + WorkItem item; + Clock::time_point tp = Clock::now(); + item.end = tp + std::chrono::milliseconds(milliseconds); + TEST_SYNC_POINT_CALLBACK("TimeQueue::Add:item.end", &item.end); + item.period = milliseconds; + item.handler = std::move(handler); + + std::unique_lock<std::mutex> lk(m_mtx); + uint64_t id = ++m_idcounter; + item.id = id; + m_items.push(std::move(item)); + + // Something changed, so wake up timer thread + m_checkWork.notify_one(); + return id; + } + + // Cancels the specified timer + // \return + // 1 if the timer was cancelled. + // 0 if you were too late to cancel (or the timer ID was never valid to + // start with) + size_t cancel(uint64_t id) { + // Instead of removing the item from the container (thus breaking the + // heap integrity), we set the item as having no handler, and put + // that handler on a new item at the top for immediate execution + // The timer thread will then ignore the original item, since it has no + // handler. 
+ std::unique_lock<std::mutex> lk(m_mtx); + for (auto&& item : m_items.getContainer()) { + if (item.id == id && item.handler) { + WorkItem newItem; + // Zero time, so it stays at the top for immediate execution + newItem.end = Clock::time_point(); + newItem.id = 0; // Means it is a canceled item + // Move the handler from item to newitem (thus clearing item) + newItem.handler = std::move(item.handler); + m_items.push(std::move(newItem)); + + // Something changed, so wake up timer thread + m_checkWork.notify_one(); + return 1; + } + } + return 0; + } + + // Cancels all timers + // \return + // The number of timers cancelled + size_t cancelAll() { + // Setting all "end" to 0 (for immediate execution) is ok, + // since it maintains the heap integrity + std::unique_lock<std::mutex> lk(m_mtx); + m_cancel = true; + for (auto&& item : m_items.getContainer()) { + if (item.id && item.handler) { + item.end = Clock::time_point(); + item.id = 0; + } + } + auto ret = m_items.size(); + + m_checkWork.notify_one(); + return ret; + } + + private: + using Clock = std::chrono::steady_clock; + TimerQueue(const TimerQueue&) = delete; + TimerQueue& operator=(const TimerQueue&) = delete; + + void run() { + std::unique_lock<std::mutex> lk(m_mtx); + while (!m_finish) { + auto end = calcWaitTime_lock(); + if (end.first) { + // Timers found, so wait until it expires (or something else + // changes) + m_checkWork.wait_until(lk, end.second); + } else { + // No timers exist, so wait forever until something changes + m_checkWork.wait(lk); + } + + // Check and execute as much work as possible, such as, all expired + // timers + checkWork(&lk); + } + + // If we are shutting down, we should not have any items left, + // since the shutdown cancels all items + assert(m_items.size() == 0); + } + + std::pair<bool, Clock::time_point> calcWaitTime_lock() { + while (m_items.size()) { + if (m_items.top().handler) { + // Item present, so return the new wait time + return std::make_pair(true, m_items.top().end); + } else { + // Discard empty handlers (they were cancelled) + m_items.pop(); + } + } + + // No items found, so return no wait time (causes the thread to wait + // indefinitely) + return std::make_pair(false, Clock::time_point()); + } + + void checkWork(std::unique_lock<std::mutex>* lk) { + while (m_items.size() && m_items.top().end <= Clock::now()) { + WorkItem item(m_items.top()); + m_items.pop(); + + if (item.handler) { + (*lk).unlock(); + auto reschedule_pair = item.handler(item.id == 0); + (*lk).lock(); + if (!m_cancel && reschedule_pair.first) { + int64_t new_period = (reschedule_pair.second == -1) + ? 
item.period + : reschedule_pair.second; + + item.period = new_period; + item.end = Clock::now() + std::chrono::milliseconds(new_period); + m_items.push(std::move(item)); + } + } + } + } + + bool m_finish = false; + bool m_cancel = false; + uint64_t m_idcounter = 0; + std::condition_variable m_checkWork; + + struct WorkItem { + Clock::time_point end; + int64_t period; + uint64_t id; // id==0 means it was cancelled + std::function<std::pair<bool, int64_t>(bool)> handler; + bool operator>(const WorkItem& other) const { return end > other.end; } + }; + + std::mutex m_mtx; + // Inheriting from priority_queue, so we can access the internal container + class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>, + std::greater<WorkItem>> { + public: + std::vector<WorkItem>& getContainer() { return this->c; } + } m_items; + rocksdb::port::Thread m_th; + bool closed_ = false; +}; diff --git a/src/rocksdb/util/timer_queue_test.cc b/src/rocksdb/util/timer_queue_test.cc new file mode 100644 index 00000000..5f5f08f2 --- /dev/null +++ b/src/rocksdb/util/timer_queue_test.cc @@ -0,0 +1,72 @@ +// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +// Timer Queue +// +// License +// +// The source code in this article is licensed under the CC0 license, so feel +// free +// to copy, modify, share, do whatever you want with it. +// No attribution is required, but Ill be happy if you do. +// CC0 license + +// The person who associated a work with this deed has dedicated the work to the +// public domain by waiving all of his or her rights to the work worldwide +// under copyright law, including all related and neighboring rights, to the +// extent allowed by law. You can copy, modify, distribute and perform the +// work, even for +// commercial purposes, all without asking permission. See Other Information +// below. 
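One contract worth spelling out before the test program below: the bool
passed to a TimerQueue handler is true when the timer was aborted by
cancel()/cancelAll(), and the returned pair is (reschedule?, new period in
milliseconds), with -1 meaning "keep the current period". A hedged sketch:

  TimerQueue q;
  q.add(500, [](bool aborted) {
    // Fires roughly 500ms from now; 'aborted' is true if cancelled first.
    // {true, -1} reschedules at the same 500ms period; {false, 0} is
    // one-shot.
    return std::make_pair(!aborted, int64_t(-1));
  });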
+// + +#include "util/timer_queue.h" +#include <future> + +namespace Timing { + +using Clock = std::chrono::high_resolution_clock; +double now() { + static auto start = Clock::now(); + return std::chrono::duration<double, std::milli>(Clock::now() - start) + .count(); +} + +} // namespace Timing + +int main() { + TimerQueue q; + + double tnow = Timing::now(); + + q.add(10000, [tnow](bool aborted) mutable { + printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(false, 0); + }); + q.add(10001, [tnow](bool aborted) mutable { + printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(false, 0); + }); + + q.add(1000, [tnow](bool aborted) mutable { + printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(!aborted, 1000); + }); + + auto id = q.add(2000, [tnow](bool aborted) mutable { + printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(!aborted, 2000); + }); + + (void)id; + // auto ret = q.cancel(id); + // assert(ret == 1); + // q.cancelAll(); + + return 0; +} +////////////////////////////////////////// diff --git a/src/rocksdb/util/trace_replay.cc b/src/rocksdb/util/trace_replay.cc new file mode 100644 index 00000000..28160b29 --- /dev/null +++ b/src/rocksdb/util/trace_replay.cc @@ -0,0 +1,300 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/trace_replay.h" + +#include <chrono> +#include <sstream> +#include <thread> +#include "db/db_impl.h" +#include "rocksdb/slice.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace rocksdb { + +const std::string kTraceMagic = "feedcafedeadbeef"; + +namespace { +void EncodeCFAndKey(std::string* dst, uint32_t cf_id, const Slice& key) { + PutFixed32(dst, cf_id); + PutLengthPrefixedSlice(dst, key); +} + +void DecodeCFAndKey(std::string& buffer, uint32_t* cf_id, Slice* key) { + Slice buf(buffer); + GetFixed32(&buf, cf_id); + GetLengthPrefixedSlice(&buf, key); +} +} // namespace + +Tracer::Tracer(Env* env, const TraceOptions& trace_options, + std::unique_ptr<TraceWriter>&& trace_writer) + : env_(env), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)), + trace_request_count_ (0) { + WriteHeader(); +} + +Tracer::~Tracer() { trace_writer_.reset(); } + +Status Tracer::Write(WriteBatch* write_batch) { + TraceType trace_type = kTraceWrite; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + trace.payload = write_batch->Data(); + return WriteTrace(trace); +} + +Status Tracer::Get(ColumnFamilyHandle* column_family, const Slice& key) { + TraceType trace_type = kTraceGet; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + EncodeCFAndKey(&trace.payload, column_family->GetID(), key); + return WriteTrace(trace); +} + +Status Tracer::IteratorSeek(const uint32_t& cf_id, const Slice& key) { + TraceType trace_type = kTraceIteratorSeek; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + EncodeCFAndKey(&trace.payload, cf_id, key); + return WriteTrace(trace); +} + 
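+// For reference, a worked example of the encoding above: for cf_id = 5 and
+// key = "abc", EncodeCFAndKey() emits 05 00 00 00 (PutFixed32,
+// little-endian) followed by 03 61 62 63 (varint32 length, then the key
+// bytes).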
+Status Tracer::IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key) { + TraceType trace_type = kTraceIteratorSeekForPrev; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + EncodeCFAndKey(&trace.payload, cf_id, key); + return WriteTrace(trace); +} + +bool Tracer::ShouldSkipTrace(const TraceType& trace_type) { + if (IsTraceFileOverMax()) { + return true; + } + if ((trace_options_.filter & kTraceFilterGet + && trace_type == kTraceGet) + || (trace_options_.filter & kTraceFilterWrite + && trace_type == kTraceWrite)) { + return true; + } + ++trace_request_count_; + if (trace_request_count_ < trace_options_.sampling_frequency) { + return true; + } + trace_request_count_ = 0; + return false; +} + +bool Tracer::IsTraceFileOverMax() { + uint64_t trace_file_size = trace_writer_->GetFileSize(); + return (trace_file_size > trace_options_.max_trace_file_size); +} + +Status Tracer::WriteHeader() { + std::ostringstream s; + s << kTraceMagic << "\t" + << "Trace Version: 0.1\t" + << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t" + << "Format: Timestamp OpType Payload\n"; + std::string header(s.str()); + + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = kTraceBegin; + trace.payload = header; + return WriteTrace(trace); +} + +Status Tracer::WriteFooter() { + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = kTraceEnd; + trace.payload = ""; + return WriteTrace(trace); +} + +Status Tracer::WriteTrace(const Trace& trace) { + std::string encoded_trace; + PutFixed64(&encoded_trace, trace.ts); + encoded_trace.push_back(trace.type); + PutFixed32(&encoded_trace, static_cast<uint32_t>(trace.payload.size())); + encoded_trace.append(trace.payload); + return trace_writer_->Write(Slice(encoded_trace)); +} + +Status Tracer::Close() { return WriteFooter(); } + +Replayer::Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles, + std::unique_ptr<TraceReader>&& reader) + : trace_reader_(std::move(reader)) { + assert(db != nullptr); + db_ = static_cast<DBImpl*>(db->GetRootDB()); + for (ColumnFamilyHandle* cfh : handles) { + cf_map_[cfh->GetID()] = cfh; + } +} + +Replayer::~Replayer() { trace_reader_.reset(); } + +Status Replayer::Replay() { + Status s; + Trace header; + s = ReadHeader(&header); + if (!s.ok()) { + return s; + } + + std::chrono::system_clock::time_point replay_epoch = + std::chrono::system_clock::now(); + WriteOptions woptions; + ReadOptions roptions; + Trace trace; + uint64_t ops = 0; + Iterator* single_iter = nullptr; + while (s.ok()) { + trace.reset(); + s = ReadTrace(&trace); + if (!s.ok()) { + break; + } + + std::this_thread::sleep_until( + replay_epoch + std::chrono::microseconds(trace.ts - header.ts)); + if (trace.type == kTraceWrite) { + WriteBatch batch(trace.payload); + db_->Write(woptions, &batch); + ops++; + } else if (trace.type == kTraceGet) { + uint32_t cf_id = 0; + Slice key; + DecodeCFAndKey(trace.payload, &cf_id, &key); + if (cf_id > 0 && cf_map_.find(cf_id) == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + std::string value; + if (cf_id == 0) { + db_->Get(roptions, key, &value); + } else { + db_->Get(roptions, cf_map_[cf_id], key, &value); + } + ops++; + } else if (trace.type == kTraceIteratorSeek) { + uint32_t cf_id = 0; + Slice key; + DecodeCFAndKey(trace.payload, &cf_id, &key); + if (cf_id > 0 && cf_map_.find(cf_id) == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + if 
(cf_id == 0) { + single_iter = db_->NewIterator(roptions); + } else { + single_iter = db_->NewIterator(roptions, cf_map_[cf_id]); + } + single_iter->Seek(key); + ops++; + delete single_iter; + } else if (trace.type == kTraceIteratorSeekForPrev) { + // Currently, only support to call the Seek() + uint32_t cf_id = 0; + Slice key; + DecodeCFAndKey(trace.payload, &cf_id, &key); + if (cf_id > 0 && cf_map_.find(cf_id) == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + if (cf_id == 0) { + single_iter = db_->NewIterator(roptions); + } else { + single_iter = db_->NewIterator(roptions, cf_map_[cf_id]); + } + single_iter->SeekForPrev(key); + ops++; + delete single_iter; + } else if (trace.type == kTraceEnd) { + // Do nothing for now. + // TODO: Add some validations later. + break; + } + } + + if (s.IsIncomplete()) { + // Reaching eof returns Incomplete status at the moment. + // Could happen when killing a process without calling EndTrace() API. + // TODO: Add better error handling. + return Status::OK(); + } + return s; +} + +Status Replayer::ReadHeader(Trace* header) { + assert(header != nullptr); + Status s = ReadTrace(header); + if (!s.ok()) { + return s; + } + if (header->type != kTraceBegin) { + return Status::Corruption("Corrupted trace file. Incorrect header."); + } + if (header->payload.substr(0, kTraceMagic.length()) != kTraceMagic) { + return Status::Corruption("Corrupted trace file. Incorrect magic."); + } + + return s; +} + +Status Replayer::ReadFooter(Trace* footer) { + assert(footer != nullptr); + Status s = ReadTrace(footer); + if (!s.ok()) { + return s; + } + if (footer->type != kTraceEnd) { + return Status::Corruption("Corrupted trace file. Incorrect footer."); + } + + // TODO: Add more validations later + return s; +} + +Status Replayer::ReadTrace(Trace* trace) { + assert(trace != nullptr); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + + Slice enc_slice = Slice(encoded_trace); + GetFixed64(&enc_slice, &trace->ts); + trace->type = static_cast<TraceType>(enc_slice[0]); + enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize); + trace->payload = enc_slice.ToString(); + return s; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/trace_replay.h b/src/rocksdb/util/trace_replay.h new file mode 100644 index 00000000..749ea2f6 --- /dev/null +++ b/src/rocksdb/util/trace_replay.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include <memory> +#include <unordered_map> +#include <utility> + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/trace_reader_writer.h" + +namespace rocksdb { + +class ColumnFamilyHandle; +class ColumnFamilyData; +class DB; +class DBImpl; +class Slice; +class WriteBatch; + +extern const std::string kTraceMagic; +const unsigned int kTraceTimestampSize = 8; +const unsigned int kTraceTypeSize = 1; +const unsigned int kTracePayloadLengthSize = 4; +const unsigned int kTraceMetadataSize = + kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize; + +enum TraceType : char { + kTraceBegin = 1, + kTraceEnd = 2, + kTraceWrite = 3, + kTraceGet = 4, + kTraceIteratorSeek = 5, + kTraceIteratorSeekForPrev = 6, + kTraceMax, +}; + +// TODO: This should also be made part of public interface to help users build +// custom TracerReaders and TraceWriters. +struct Trace { + uint64_t ts; + TraceType type; + std::string payload; + + void reset() { + ts = 0; + type = kTraceMax; + payload.clear(); + } +}; + +// Trace RocksDB operations using a TraceWriter. +class Tracer { + public: + Tracer(Env* env, const TraceOptions& trace_options, + std::unique_ptr<TraceWriter>&& trace_writer); + ~Tracer(); + + Status Write(WriteBatch* write_batch); + Status Get(ColumnFamilyHandle* cfname, const Slice& key); + Status IteratorSeek(const uint32_t& cf_id, const Slice& key); + Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + bool IsTraceFileOverMax(); + + Status Close(); + + private: + Status WriteHeader(); + Status WriteFooter(); + Status WriteTrace(const Trace& trace); + bool ShouldSkipTrace(const TraceType& type); + + Env* env_; + TraceOptions trace_options_; + std::unique_ptr<TraceWriter> trace_writer_; + uint64_t trace_request_count_; +}; + +// Replay RocksDB operations from a trace. +class Replayer { + public: + Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles, + std::unique_ptr<TraceReader>&& reader); + ~Replayer(); + + Status Replay(); + + private: + Status ReadHeader(Trace* header); + Status ReadFooter(Trace* footer); + Status ReadTrace(Trace* trace); + + DBImpl* db_; + std::unique_ptr<TraceReader> trace_reader_; + std::unordered_map<uint32_t, ColumnFamilyHandle*> cf_map_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/transaction_test_util.cc b/src/rocksdb/util/transaction_test_util.cc new file mode 100644 index 00000000..30cff11e --- /dev/null +++ b/src/rocksdb/util/transaction_test_util.cc @@ -0,0 +1,385 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
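Before the transaction test utility, a hedged end-to-end sketch of driving
the tracing classes above (DB::StartTrace()/EndTrace() and
NewFileTraceWriter()/NewFileTraceReader() are assumed from the public
RocksDB API of this version; error handling omitted):

  // Record a workload.
  std::unique_ptr<rocksdb::TraceWriter> trace_writer;
  rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(), "/tmp/trace",
                              &trace_writer);
  db->StartTrace(rocksdb::TraceOptions(), std::move(trace_writer));
  // ... issue Get()/Write() traffic ...
  db->EndTrace();

  // Later, replay it against another DB instance.
  std::unique_ptr<rocksdb::TraceReader> trace_reader;
  rocksdb::NewFileTraceReader(env, rocksdb::EnvOptions(), "/tmp/trace",
                              &trace_reader);
  rocksdb::Replayer replayer(db2, column_families, std::move(trace_reader));
  rocksdb::Status s = replayer.Replay();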
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "util/transaction_test_util.h"
+
+#include <inttypes.h>
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <string>
+#include <thread>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+#include "db/dbformat.h"
+#include "db/snapshot_impl.h"
+#include "util/logging.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+RandomTransactionInserter::RandomTransactionInserter(
+    Random64* rand, const WriteOptions& write_options,
+    const ReadOptions& read_options, uint64_t num_keys, uint16_t num_sets,
+    const uint64_t cmt_delay_ms, const uint64_t first_id)
+    : rand_(rand),
+      write_options_(write_options),
+      read_options_(read_options),
+      num_keys_(num_keys),
+      num_sets_(num_sets),
+      txn_id_(first_id),
+      cmt_delay_ms_(cmt_delay_ms) {}
+
+RandomTransactionInserter::~RandomTransactionInserter() {
+  if (txn_ != nullptr) {
+    delete txn_;
+  }
+  if (optimistic_txn_ != nullptr) {
+    delete optimistic_txn_;
+  }
+}
+
+bool RandomTransactionInserter::TransactionDBInsert(
+    TransactionDB* db, const TransactionOptions& txn_options) {
+  txn_ = db->BeginTransaction(write_options_, txn_options, txn_);
+
+  std::hash<std::thread::id> hasher;
+  char name[64];
+  snprintf(name, 64, "txn%" ROCKSDB_PRIszt "-%" PRIu64,
+           hasher(std::this_thread::get_id()), txn_id_++);
+  assert(strlen(name) < 64 - 1);
+  assert(txn_->SetName(name).ok());
+
+  // Take a snapshot if set_snapshot was not set, or with 50% chance otherwise
+  bool take_snapshot = txn_->GetSnapshot() == nullptr || rand_->OneIn(2);
+  if (take_snapshot) {
+    txn_->SetSnapshot();
+    read_options_.snapshot = txn_->GetSnapshot();
+  }
+  auto res = DoInsert(db, txn_, false);
+  if (take_snapshot) {
+    read_options_.snapshot = nullptr;
+  }
+  return res;
+}
+
+bool RandomTransactionInserter::OptimisticTransactionDBInsert(
+    OptimisticTransactionDB* db,
+    const OptimisticTransactionOptions& txn_options) {
+  optimistic_txn_ =
+      db->BeginTransaction(write_options_, txn_options, optimistic_txn_);
+
+  return DoInsert(db, optimistic_txn_, true);
+}
+
+bool RandomTransactionInserter::DBInsert(DB* db) {
+  return DoInsert(db, nullptr, false);
+}
+
+Status RandomTransactionInserter::DBGet(
+    DB* db, Transaction* txn, ReadOptions& read_options, uint16_t set_i,
+    uint64_t ikey, bool get_for_update, uint64_t* int_value,
+    std::string* full_key, bool* unexpected_error) {
+  Status s;
+  // Five digits (since the largest uint16_t is 65535) plus the NUL
+  // end char.
+  char prefix_buf[6];
+  // Pad the prefix appropriately so we can iterate over each set
+  assert(set_i + 1 <= 9999);
+  snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1);
+  // key format: [SET#][random#]
+  std::string skey = ToString(ikey);
+  Slice base_key(skey);
+  *full_key = std::string(prefix_buf) + base_key.ToString();
+  Slice key(*full_key);
+
+  std::string value;
+  if (txn != nullptr) {
+    if (get_for_update) {
+      s = txn->GetForUpdate(read_options, key, &value);
+    } else {
+      s = txn->Get(read_options, key, &value);
+    }
+  } else {
+    s = db->Get(read_options, key, &value);
+  }
+
+  if (s.ok()) {
+    // Found the key; parse its value
+    *int_value = std::stoull(value);
+    if (*int_value == 0 || *int_value == ULONG_MAX) {
+      *unexpected_error = true;
+      fprintf(stderr, "Get returned unexpected value: %s\n", value.c_str());
+      s = Status::Corruption();
+    }
+  } else if (s.IsNotFound()) {
+    // Have not yet written to this key, so assume its value is 0
+    *int_value = 0;
+    s = Status::OK();
+  }
+  return s;
+}
+
+bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn,
+                                         bool is_optimistic) {
+  Status s;
+  WriteBatch batch;
+
+  // pick a random number to use to increment a key in each set
+  uint64_t incr = (rand_->Next() % 100) + 1;
+  bool unexpected_error = false;
+
+  std::vector<uint16_t> set_vec(num_sets_);
+  std::iota(set_vec.begin(), set_vec.end(), static_cast<uint16_t>(0));
+  std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{});
+
+  // For each set, pick a key at random and increment it
+  for (uint16_t set_i : set_vec) {
+    uint64_t int_value = 0;
+    std::string full_key;
+    uint64_t rand_key = rand_->Next() % num_keys_;
+    const bool get_for_update = txn ? rand_->OneIn(2) : false;
+    s = DBGet(db, txn, read_options_, set_i, rand_key, get_for_update,
+              &int_value, &full_key, &unexpected_error);
+    Slice key(full_key);
+    if (!s.ok()) {
+      // Optimistic transactions should never return a non-ok status here.
+      // Non-optimistic transactions may return write-conflict/timeout errors.
+      if (is_optimistic || !(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) {
+        fprintf(stderr, "Get returned an unexpected error: %s\n",
+                s.ToString().c_str());
+        unexpected_error = true;
+      }
+      break;
+    }
+
+    if (s.ok()) {
+      // Increment the key
+      std::string sum = ToString(int_value + incr);
+      if (txn != nullptr) {
+        s = txn->Put(key, sum);
+        if (!get_for_update && (s.IsBusy() || s.IsTimedOut())) {
+          // If the initial get was not for update, then the key is not locked
+          // before the put, and the put could fail due to concurrent writes.
+          break;
+        } else if (!s.ok()) {
+          // Since we did a GetForUpdate, Put should not fail.
+ fprintf(stderr, "Put returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + } else { + batch.Put(key, sum); + } + bytes_inserted_ += key.size() + sum.size(); + } + if (txn != nullptr) { + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Insert (%s) %s snap: %" PRIu64 " key:%s value: %" PRIu64 + "+%" PRIu64 "=%" PRIu64, + txn->GetName().c_str(), s.ToString().c_str(), + txn->GetSnapshot()->GetSequenceNumber(), full_key.c_str(), + int_value, incr, int_value + incr); + } + } + + if (s.ok()) { + if (txn != nullptr) { + bool with_prepare = !is_optimistic && !rand_->OneIn(10); + if (with_prepare) { + // Also try commit without prepare + s = txn->Prepare(); + assert(s.ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), + s.ToString().c_str(), txn->GetName().c_str()); + db->GetDBOptions().env->SleepForMicroseconds( + static_cast<int>(cmt_delay_ms_ * 1000)); + } + if (!rand_->OneIn(20)) { + s = txn->Commit(); + assert(!with_prepare || s.ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Commit of %" PRIu64 " %s (%s)", txn->GetId(), + s.ToString().c_str(), txn->GetName().c_str()); + } else { + // Also try 5% rollback + s = txn->Rollback(); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Rollback %" PRIu64 " %s %s", txn->GetId(), + txn->GetName().c_str(), s.ToString().c_str()); + assert(s.ok()); + } + assert(is_optimistic || s.ok()); + + if (!s.ok()) { + if (is_optimistic) { + // Optimistic transactions can have write-conflict errors on commit. + // Any other error is unexpected. + if (!(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { + unexpected_error = true; + } + } else { + // Non-optimistic transactions should only fail due to expiration + // or write failures. For testing purproses, we do not expect any + // write failures. + if (!s.IsExpired()) { + unexpected_error = true; + } + } + + if (unexpected_error) { + fprintf(stderr, "Commit returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + } else { + s = db->Write(write_options_, &batch); + if (!s.ok()) { + unexpected_error = true; + fprintf(stderr, "Write returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + } else { + if (txn != nullptr) { + assert(txn->Rollback().ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Error %s for txn %s", + s.ToString().c_str(), txn->GetName().c_str()); + } + } + + if (s.ok()) { + success_count_++; + } else { + failure_count_++; + } + + last_status_ = s; + + // return success if we didn't get any unexpected errors + return !unexpected_error; +} + +// Verify that the sum of the keys in each set are equal +Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets, + uint64_t num_keys_per_set, + bool take_snapshot, Random64* rand, + uint64_t delay_ms) { + // delay_ms is the delay between taking a snapshot and doing the reads. It + // emulates reads from a long-running backup job. 
+ assert(delay_ms == 0 || take_snapshot); + uint64_t prev_total = 0; + uint32_t prev_i = 0; + bool prev_assigned = false; + + ReadOptions roptions; + if (take_snapshot) { + roptions.snapshot = db->GetSnapshot(); + db->GetDBOptions().env->SleepForMicroseconds( + static_cast<int>(delay_ms * 1000)); + } + + std::vector<uint16_t> set_vec(num_sets); + std::iota(set_vec.begin(), set_vec.end(), static_cast<uint16_t>(0)); + std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); + + // For each set of keys with the same prefix, sum all the values + for (uint16_t set_i : set_vec) { + // Five digits (since the largest uint16_t is 65535) plus the NUL + // end char. + char prefix_buf[6]; + assert(set_i + 1 <= 9999); + snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1); + uint64_t total = 0; + + // Use either point lookup or iterator. Point lookups are slower so we use + // it less often. + const bool use_point_lookup = + num_keys_per_set != 0 && rand && rand->OneIn(10); + if (use_point_lookup) { + ReadOptions read_options; + for (uint64_t k = 0; k < num_keys_per_set; k++) { + std::string dont_care; + uint64_t int_value = 0; + bool unexpected_error = false; + const bool FOR_UPDATE = false; + Status s = DBGet(db, nullptr, roptions, set_i, k, FOR_UPDATE, + &int_value, &dont_care, &unexpected_error); + assert(s.ok()); + assert(!unexpected_error); + total += int_value; + } + } else { // user iterators + Iterator* iter = db->NewIterator(roptions); + for (iter->Seek(Slice(prefix_buf, 4)); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + // stop when we reach a different prefix + if (key.ToString().compare(0, 4, prefix_buf) != 0) { + break; + } + Slice value = iter->value(); + uint64_t int_value = std::stoull(value.ToString()); + if (int_value == 0 || int_value == ULONG_MAX) { + fprintf(stderr, "Iter returned unexpected value: %s\n", + value.ToString().c_str()); + return Status::Corruption(); + } + ROCKS_LOG_DEBUG( + db->GetDBOptions().info_log, + "VerifyRead at %" PRIu64 " (%" PRIu64 "): %.*s value: %" PRIu64, + roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul, + roptions.snapshot + ? ((SnapshotImpl*)roptions.snapshot)->min_uncommitted_ + : 0ul, + static_cast<int>(key.size()), key.data(), int_value); + total += int_value; + } + delete iter; + } + + if (prev_assigned && total != prev_total) { + db->GetDBOptions().info_log->Flush(); + fprintf(stdout, + "RandomTransactionVerify found inconsistent totals using " + "pointlookup? %d " + "Set[%" PRIu32 "]: %" PRIu64 ", Set[%" PRIu32 "]: %" PRIu64 + " at snapshot %" PRIu64 "\n", + use_point_lookup, prev_i, prev_total, set_i, total, + roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul); + fflush(stdout); + return Status::Corruption(); + } else { + ROCKS_LOG_DEBUG( + db->GetDBOptions().info_log, + "RandomTransactionVerify pass pointlookup? %d total: %" PRIu64 + " snap: %" PRIu64, + use_point_lookup, total, + roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul); + } + prev_total = total; + prev_i = set_i; + prev_assigned = true; + } + if (take_snapshot) { + db->ReleaseSnapshot(roptions.snapshot); + } + + return Status::OK(); +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/util/transaction_test_util.h b/src/rocksdb/util/transaction_test_util.h new file mode 100644 index 00000000..1aa4196a --- /dev/null +++ b/src/rocksdb/util/transaction_test_util.h @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
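Stepping back to the stress utility implemented above, a hedged driver
sketch (txn_db is assumed to be an open TransactionDB*; single-threaded for
brevity):

  rocksdb::Random64 rand(301);
  rocksdb::RandomTransactionInserter inserter(&rand);
  for (int i = 0; i < 1000; i++) {
    // Each call increments one random key per set inside a transaction.
    inserter.TransactionDBInsert(txn_db);
  }
  // Whether or not individual transactions failed, the per-set totals
  // must still agree.
  rocksdb::Status s =
      rocksdb::RandomTransactionInserter::Verify(txn_db, /*num_sets=*/3);
  assert(s.ok());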
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/options.h" +#include "port/port.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction_db.h" + +namespace rocksdb { + +class DB; +class Random64; + +// Utility class for stress testing transactions. Can be used to write many +// transactions in parallel and then validate that the data written is logically +// consistent. This class assumes the input DB is initially empty. +// +// Each call to TransactionDBInsert()/OptimisticTransactionDBInsert() will +// increment the value of a key in #num_sets sets of keys. Regardless of +// whether the transaction succeeds, the total sum of values of keys in each +// set is an invariant that should remain equal. +// +// After calling TransactionDBInsert()/OptimisticTransactionDBInsert() many +// times, Verify() can be called to validate that the invariant holds. +// +// To test writing Transaction in parallel, multiple threads can create a +// RandomTransactionInserter with similar arguments using the same DB. +class RandomTransactionInserter { + public: + // num_keys is the number of keys in each set. + // num_sets is the number of sets of keys. + // cmt_delay_ms is the delay between prepare (if there is any) and commit + // first_id is the id of the first transaction + explicit RandomTransactionInserter( + Random64* rand, const WriteOptions& write_options = WriteOptions(), + const ReadOptions& read_options = ReadOptions(), uint64_t num_keys = 1000, + uint16_t num_sets = 3, const uint64_t cmt_delay_ms = 0, + const uint64_t first_id = 0); + + ~RandomTransactionInserter(); + + // Increment a key in each set using a Transaction on a TransactionDB. + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). Error status may be obtained by calling + // GetLastStatus(); + bool TransactionDBInsert( + TransactionDB* db, + const TransactionOptions& txn_options = TransactionOptions()); + + // Increment a key in each set using a Transaction on an + // OptimisticTransactionDB + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). Error status may be obtained by calling + // GetLastStatus(); + bool OptimisticTransactionDBInsert( + OptimisticTransactionDB* db, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions()); + // Increment a key in each set without using a transaction. If this function + // is called in parallel, then Verify() may fail. + // + // Returns true if the write succeeds. + // Error status may be obtained by calling GetLastStatus(). + bool DBInsert(DB* db); + + // Get the ikey'th key from set set_i + static Status DBGet(DB* db, Transaction* txn, ReadOptions& read_options, + uint16_t set_i, uint64_t ikey, bool get_for_update, + uint64_t* int_value, std::string* full_key, + bool* unexpected_error); + + // Returns OK if Invariant is true. 
+  static Status Verify(DB* db, uint16_t num_sets, uint64_t num_keys_per_set = 0,
+                       bool take_snapshot = false, Random64* rand = nullptr,
+                       uint64_t delay_ms = 0);
+
+  // Returns the status of the previous Insert operation
+  Status GetLastStatus() { return last_status_; }
+
+  // Returns the number of successful calls to
+  // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert
+  uint64_t GetSuccessCount() { return success_count_; }
+
+  // Returns the number of calls to
+  // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert that did not
+  // write any data.
+  uint64_t GetFailureCount() { return failure_count_; }
+
+  // Returns the total size, in bytes, of the user keys/values Put() to the DB.
+  size_t GetBytesInserted() { return bytes_inserted_; }
+
+ private:
+  // Input options
+  Random64* rand_;
+  const WriteOptions write_options_;
+  ReadOptions read_options_;
+  const uint64_t num_keys_;
+  const uint16_t num_sets_;
+
+  // Number of successful insert batches performed
+  uint64_t success_count_ = 0;
+
+  // Number of failed insert batches attempted
+  uint64_t failure_count_ = 0;
+
+  size_t bytes_inserted_ = 0;
+
+  // Status returned by most recent insert operation
+  Status last_status_;
+
+  // Optimization: re-use allocated transaction objects.
+  Transaction* txn_ = nullptr;
+  Transaction* optimistic_txn_ = nullptr;
+
+  uint64_t txn_id_;
+
+  // The delay between ::Prepare and ::Commit
+  const uint64_t cmt_delay_ms_;
+
+  bool DoInsert(DB* db, Transaction* txn, bool is_optimistic);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/user_comparator_wrapper.h b/src/rocksdb/util/user_comparator_wrapper.h
new file mode 100644
index 00000000..43797709
--- /dev/null
+++ b/src/rocksdb/util/user_comparator_wrapper.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+
+namespace rocksdb {
+
+// Wrapper around a user comparator that automatically increments
+// perf_context.user_key_comparison_count on every comparison.
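+//
+// A minimal usage sketch (illustrative only; BytewiseComparator() is the
+// stock RocksDB comparator, and perf_context must be enabled for the counter
+// to be observable):
+//
+//   UserComparatorWrapper wrapped(BytewiseComparator());
+//   int r = wrapped.Compare(Slice("a"), Slice("b"));  // r < 0, and
+//   // perf_context.user_key_comparison_count has been incremented.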
+class UserComparatorWrapper final : public Comparator { + public: + explicit UserComparatorWrapper(const Comparator* const user_cmp) + : user_comparator_(user_cmp) {} + + ~UserComparatorWrapper() = default; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const Slice& a, const Slice& b) const override { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->Compare(a, b); + } + + bool Equal(const Slice& a, const Slice& b) const override { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->Equal(a, b); + } + + const char* Name() const override { return user_comparator_->Name(); } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + return user_comparator_->FindShortestSeparator(start, limit); + } + + void FindShortSuccessor(std::string* key) const override { + return user_comparator_->FindShortSuccessor(key); + } + + const Comparator* GetRootComparator() const override { + return user_comparator_->GetRootComparator(); + } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + return user_comparator_->IsSameLengthImmediateSuccessor(s, t); + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return user_comparator_->CanKeysWithDifferentByteContentsBeEqual(); + } + + private: + const Comparator* user_comparator_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/util.h b/src/rocksdb/util/util.h new file mode 100644 index 00000000..a5fd3649 --- /dev/null +++ b/src/rocksdb/util/util.h @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+
+#ifndef FALLTHROUGH_INTENDED
+#if defined(__clang__)
+#define FALLTHROUGH_INTENDED [[clang::fallthrough]]
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define FALLTHROUGH_INTENDED [[gnu::fallthrough]]
+#else
+#define FALLTHROUGH_INTENDED do {} while (0)
+#endif
+#endif
diff --git a/src/rocksdb/util/vector_iterator.h b/src/rocksdb/util/vector_iterator.h
new file mode 100644
index 00000000..da60eb22
--- /dev/null
+++ b/src/rocksdb/util/vector_iterator.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace rocksdb {
+
+// Iterator over a vector of keys/values
+class VectorIterator : public InternalIterator {
+ public:
+  VectorIterator(std::vector<std::string> keys, std::vector<std::string> values,
+                 const InternalKeyComparator* icmp)
+      : keys_(std::move(keys)),
+        values_(std::move(values)),
+        indexed_cmp_(icmp, &keys_),
+        // Use the member keys_ here: the parameter `keys` has just been moved
+        // from, so its size is unspecified. Starting at keys_.size() leaves
+        // the iterator positioned past the end, i.e. !Valid().
+        current_(keys_.size()) {
+    assert(keys_.size() == values_.size());
+
+    indices_.reserve(keys_.size());
+    for (size_t i = 0; i < keys_.size(); i++) {
+      indices_.push_back(i);
+    }
+    std::sort(indices_.begin(), indices_.end(), indexed_cmp_);
+  }
+
+  virtual bool Valid() const override {
+    return !indices_.empty() && current_ < indices_.size();
+  }
+
+  virtual void SeekToFirst() override { current_ = 0; }
+  virtual void SeekToLast() override { current_ = indices_.size() - 1; }
+
+  virtual void Seek(const Slice& target) override {
+    current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
+                                indexed_cmp_) -
+               indices_.begin();
+  }
+
+  virtual void SeekForPrev(const Slice& target) override {
+    current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
+                                indexed_cmp_) -
+               indices_.begin();
+    if (!Valid()) {
+      SeekToLast();
+    } else {
+      Prev();
+    }
+  }
+
+  virtual void Next() override { current_++; }
+  virtual void Prev() override { current_--; }
+
+  virtual Slice key() const override {
+    return Slice(keys_[indices_[current_]]);
+  }
+  virtual Slice value() const override {
+    return Slice(values_[indices_[current_]]);
+  }
+
+  virtual Status status() const override { return Status::OK(); }
+
+  virtual bool IsKeyPinned() const override { return true; }
+  virtual bool IsValuePinned() const override { return true; }
+
+ private:
+  struct IndexedKeyComparator {
+    IndexedKeyComparator(const InternalKeyComparator* c,
+                         const std::vector<std::string>* ks)
+        : cmp(c), keys(ks) {}
+
+    bool operator()(size_t a, size_t b) const {
+      return cmp->Compare((*keys)[a], (*keys)[b]) < 0;
+    }
+
+    bool operator()(size_t a, const Slice& b) const {
+      return cmp->Compare((*keys)[a], b) < 0;
+    }
+
+    bool operator()(const Slice& a, size_t b) const {
+      return cmp->Compare(a, (*keys)[b]) < 0;
+    }
+
+    const InternalKeyComparator* cmp;
+    const std::vector<std::string>* keys;
+  };
+
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+  IndexedKeyComparator indexed_cmp_;
+  std::vector<size_t> indices_;
+  size_t current_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/xxhash.cc b/src/rocksdb/util/xxhash.cc
new file mode 100644
index 00000000..2ec95a63
--- /dev/null
+++ b/src/rocksdb/util/xxhash.cc
@@ -0,0 +1,1074 @@
+/*
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2014, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+
+//**************************************
+// Tuning parameters
+//**************************************
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable. Unfortunately, on some target/compiler combinations, the
+ * generated assembly is sub-optimal. The switch below allows selecting a
+ * different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on a compiler extension (i.e.,
+ * not portable). This method is safe if your compiler supports it, and
+ * *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on the compiler, but it
+ * violates the C standard. It can generate buggy code on targets which do not
+ * support unaligned memory accesses. But in some circumstances, it's the only
+ * known way to get the best performance (e.g., GCC + ARMv6).
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2).
+ */
+
+#include "util/util.h"
+
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line \
+                                   for example */
+#if defined(__GNUC__) && \
+    (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+     defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
+     defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__))
+#define XXH_FORCE_MEMORY_ACCESS 2
+#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+    (defined(__GNUC__) && \
+     (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+      defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \
+      defined(__ARM_ARCH_7S__)))
+#define XXH_FORCE_MEMORY_ACCESS 1
+#endif
+#endif
+
+// Unaligned memory access is automatically enabled for "common" CPUs, such as
+// x86. For other CPUs, the compiler will be more cautious and insert extra
+// code to ensure aligned access is respected. If you know your target CPU
+// supports unaligned memory access, you may want to force this option manually
+// to improve performance.
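+// (Illustration only: rather than editing this file, such a build could
+// define the macro on the compiler command line, e.g.
+//     g++ -DXXH_USE_UNALIGNED_ACCESS=1 -c xxhash.cc
+// or pass the equivalent flag through its build system.)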
+// You can also enable this parameter if you know your input data will always
+// be aligned (boundaries of 4, for U32).
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
+// XXH_ACCEPT_NULL_INPUT_POINTER :
+// If the input pointer is a null pointer, xxHash's default behavior is to
+// trigger a memory access error, since it is a bad pointer.
+// When this option is enabled, xxHash's output for a null input pointer is the
+// same as for a zero-length input.
+// This option has a very small performance cost (only measurable on small
+// inputs). By default, this option is disabled. To enable it, uncomment the
+// define below:
+//#define XXH_ACCEPT_NULL_INPUT_POINTER 1
+
+// XXH_FORCE_NATIVE_FORMAT :
+// By default, the xxHash library provides endian-independent hash values,
+// based on the little-endian convention.
+// Results are therefore identical for little-endian and big-endian CPUs.
+// This comes at a performance cost for big-endian CPUs, since some swapping is
+// required to emulate the little-endian format.
+// Should endian-independence be of no importance for your application, you may
+// set the #define below to 1. It will improve speed for big-endian CPUs.
+// This option has no impact on little-endian CPUs.
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || \
+    defined(_M_X64)
+#define XXH_FORCE_ALIGN_CHECK 0
+#else
+#define XXH_FORCE_ALIGN_CHECK 1
+#endif
+#endif
+
+//**************************************
+// Compiler Specific Options
+//**************************************
+// Disable some Visual Studio warning messages
+#ifdef _MSC_VER  // Visual Studio
+# pragma warning(disable : 4127)  // disable: C4127: conditional expression is constant
+# pragma warning(disable : 4804)  // disable: C4804: 'operation' : unsafe use of type 'bool' in operation (static assert line 313)
+#endif
+
+#ifdef _MSC_VER  // Visual Studio
+# define FORCE_INLINE static __forceinline
+#else
+# ifdef __GNUC__
+#  define FORCE_INLINE static inline __attribute__((always_inline))
+# else
+#  define FORCE_INLINE static inline
+# endif
+#endif
+
+
+//**************************************
+// Includes & Memory related functions
+//**************************************
+#include "xxhash.h"
+// Modify the local functions below should you wish to use some other memory
+// related routines for malloc(), free()
+#include <stdlib.h>
+FORCE_INLINE void* XXH_malloc(size_t s) { return malloc(s); }
+FORCE_INLINE void  XXH_free (void* p)  { free(p); }
+// for memcpy()
+#include <string.h>
+FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+#include <assert.h> /* assert */
+
+namespace rocksdb {
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L  // C99
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef int32_t  S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+ typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# ifdef __IBMC__ +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct _U32_S { U32 v; } _PACKED U32_S; + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# pragma pack(pop) +#endif + +#define A32(x) (((U32_S *)(x))->v) + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1)) + +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { + U32 u32; +} __attribute__((packed)) unalign; +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ +static U32 XXH_read32(const void* memPtr) { + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +//*************************************** +// Compiler-specific Functions and Macros +//*************************************** +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// Note : although _rotl exists for minGW (GCC under windows), performance seems poor +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +#define XXH_rotl64(x, r) _rotl64(x, r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#define XXH_rotl64(x, r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) // Visual Studio +# define XXH_swap32 _byteswap_ulong +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static inline U32 XXH_swap32 (U32 x) { + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff );} +#endif + + +//************************************** +// Constants +//************************************** +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + + +//************************************** +// Architecture Macros +//************************************** +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; +#ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch + static const int one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) +#endif + + +//************************************** +// Macros +//************************************** +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations + + +//**************************** +// Memory reads +//**************************** +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? 
A32(ptr) : XXH_swap32(A32(ptr));
+    else
+        return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian,
+                                    XXH_alignment align) {
+  if (align == XXH_unaligned)
+    return endian == XXH_littleEndian ? XXH_read32(ptr)
+                                      : XXH_swap32(XXH_read32(ptr));
+  else
+    return endian == XXH_littleEndian ? *(const U32*)ptr
+                                      : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) {
+  return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+//****************************
+// Simple Hash Functions
+//****************************
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+FORCE_INLINE U32 XXH32_endian_align(const void* input, int len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U32 h32;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; }
+#endif
+
+    if (len>=16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        h32 = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p<=bEnd-4)
+    {
+        h32 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_3;
+        h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32(const void* input, int len, U32 seed)
+{
+#if 0
+    // Simple version, good for code maintenance, but unfortunately slow for small inputs
+    void* state = XXH32_init(seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+# if !defined(XXH_USE_UNALIGNED_ACCESS)
+    // Note: the alignment test must be "== 0" to match the comment below;
+    // a non-zero low-bit pattern means the input is *not* 4-byte aligned.
+    if ((((size_t)input) & 3) == 0)   // Input is aligned, let's leverage the speed advantage
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+# endif
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+struct XXH_state32_t
+{
+    U64 total_len;
+    U32 seed;
+    U32 v1;
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    int memsize;
+    char memory[16];
+};
+
+
+int XXH32_sizeofState()
+{
+    XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t));   // A compilation error here
means XXH32_SIZEOFSTATE is not large enough + return sizeof(struct XXH_state32_t); +} + + +XXH_errorcode XXH32_resetState(void* state_in, U32 seed) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + state->seed = seed; + state->v1 = seed + PRIME32_1 + PRIME32_2; + state->v2 = seed + PRIME32_2; + state->v3 = seed + 0; + state->v4 = seed - PRIME32_1; + state->total_len = 0; + state->memsize = 0; + return XXH_OK; +} + + +void* XXH32_init (U32 seed) +{ + void* state = XXH_malloc (sizeof(struct XXH_state32_t)); + XXH32_resetState(state, seed); + return state; +} + + +FORCE_INLINE XXH_errorcode XXH32_update_endian (void* state_in, const void* input, int len, XXH_endianess endian) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 16) // fill in tmp buffer + { + XXH_memcpy(state->memory + state->memsize, input, len); + state->memsize += len; + return XXH_OK; + } + + if (state->memsize) // some data left from previous update + { + XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize); + { + const U32* p32 = (const U32*)state->memory; + state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++; + state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; + state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++; + state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) + { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do + { + v1 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; + v2 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; + v3 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; + v4 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + XXH_memcpy(state->memory, p, bEnd-p); + state->memsize = (int)(bEnd-p); + } + + return XXH_OK; +} + +XXH_errorcode XXH32_update (void* state_in, const void* input, int len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE U32 XXH32_intermediateDigest_endian (void* state_in, XXH_endianess endian) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + const BYTE * p = (const BYTE*)state->memory; + BYTE* bEnd = (BYTE*)state->memory + state->memsize; + U32 h32; + + if (state->total_len >= 16) + { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } + else + { + h32 = state->seed + PRIME32_5; + } + + h32 += (U32) 
state->total_len; + + while (p<=bEnd-4) + { + h32 += XXH_readLE32((const U32*)p, endian) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4; + p+=4; + } + + while (p<bEnd) + { + h32 += (*p) * PRIME32_5; + h32 = XXH_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +U32 XXH32_intermediateDigest (void* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_intermediateDigest_endian(state_in, XXH_littleEndian); + else + return XXH32_intermediateDigest_endian(state_in, XXH_bigEndian); +} + + +U32 XXH32_digest (void* state_in) +{ + U32 h32 = XXH32_intermediateDigest(state_in); + + XXH_free(state_in); + + return h32; +} + +/* ******************************************************************* + * 64-bit hash functions + *********************************************************************/ + + #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + + /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ + static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + + #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + + /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ + /* currently only defined for gcc and icc */ + typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64; + static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + + #else + + /* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ + + static U64 XXH_read64(const void* memPtr) + { + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; + } +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +#define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +#define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64(U64 x) { + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, + XXH_alignment align) { + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read64(ptr) + : XXH_swap64(XXH_read64(ptr)); + else + return endian == XXH_littleEndian ? *(const U64*)ptr + : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) { + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) { + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = + 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 + */ +static const U64 PRIME64_2 = + 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 + */ +static const U64 PRIME64_3 = + 1609587929392839161ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 + */ +static const U64 PRIME64_4 = + 9650029242287828579ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 + */ +static const U64 PRIME64_5 = + 2870177450012600261ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 + */ + +static U64 XXH64_round(U64 acc, U64 input) { + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) { + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) { + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) { + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 \ + { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; \ + } + + switch (len & 31) { + case 24: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 16: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 8: + PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 20: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 12: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 4: + PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 17: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 9: + PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 21: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 13: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 5: + PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 18: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 10: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 22: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 14: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 6: + PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 19: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 11: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return 
XXH64_avalanche(h64); + + case 31: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 23: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 15: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 7: + PROCESS4_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 3: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 2: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 1: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 0: + return XXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) { + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE*)(size_t)32; + } +#endif + + if (len >= 32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); + p += 8; + v2 = XXH64_round(v2, XXH_get64bits(p)); + p += 8; + v3 = XXH64_round(v3, XXH_get64bits(p)); + p += 8; + v4 = XXH64_round(v4, XXH_get64bits(p)); + p += 8; + } while (p <= limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64)len; + + return XXH64_finalize(h64, p, len, endian, align); +} + +unsigned long long XXH64(const void* input, size_t len, + unsigned long long seed) { +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7) == + 0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH64_state_t* XXH64_createState(void) { + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) { + XXH_free(statePtr); + return XXH_OK; +} + +void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) { + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) { + XXH64_state_t state; /* using a local state to memcpy() in order to avoid + strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - 
PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE XXH_errorcode XXH64_update_endian(XXH64_state_t* state, + const void* input, size_t len, + XXH_endianess endian) { + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, + 32 - state->memsize); + state->v1 = + XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0, endian)); + state->v2 = + XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1, endian)); + state->v3 = + XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2, endian)); + state->v4 = + XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3, endian)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); + p += 8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); + p += 8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); + p += 8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); + p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + } + + return XXH_OK; +} + +XXH_errorcode XXH64_update(XXH64_state_t* state_in, const void* input, + size_t len) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian(const XXH64_state_t* state, + XXH_endianess endian) { + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64)state->total_len; + + return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, + XXH_aligned); +} + +unsigned long long XXH64_digest(const XXH64_state_t* state_in) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + +/*====== Canonical representation ======*/ + +void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) { + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + 
memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) {
+  return XXH_readBE64(src);
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/xxhash.h b/src/rocksdb/util/xxhash.h
new file mode 100644
index 00000000..88352ac7
--- /dev/null
+++ b/src/rocksdb/util/xxhash.h
@@ -0,0 +1,240 @@
+/*
+   xxHash - Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2014, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, 32-bit Windows 7, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing the SMHasher test set.
+10 is a perfect score.
+*/
+
+#pragma once
+
+#include <stdlib.h>
+
+#if !defined(__VMS) && \
+    (defined(__cplusplus) || \
+     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
+#include <stdint.h>
+#endif
+
+#if defined (__cplusplus)
+namespace rocksdb {
+#endif
+
+
+//****************************
+// Type
+//****************************
+/* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+
+unsigned int XXH32 (const void* input, int len, unsigned int seed);
+
+/*
+XXH32() :
+    Calculate the 32-bit hash of the sequence of "len" bytes stored at memory
+    address "input".
+    The memory between input & input+len must be valid (allocated and
+    read-accessible).
+    "seed" can be used to alter the result predictably.
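+    (A minimal illustrative call, with buf/buf_len standing in for any valid
+    byte range and 0 as an arbitrary seed:
+        unsigned int h = XXH32(buf, buf_len, 0);
+    )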
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+    Note that "len" is type "int", which means it is limited to 2^31-1.
+    If your data is larger, use the advanced functions below.
+*/
+
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+void*         XXH32_init   (unsigned int seed);
+XXH_errorcode XXH32_update (void* state, const void* input, int len);
+unsigned int  XXH32_digest (void* state);
+
+/*
+These functions calculate the xxHash of an input provided in several small
+packets, as opposed to an input provided as a single block.
+
+The calculation must be started with :
+void* XXH32_init()
+The function returns a pointer which holds the state of the calculation.
+
+This pointer must be provided as the "void* state" parameter for
+XXH32_update(). XXH32_update() can be called as many times as necessary.
+The user must provide a valid (allocated) input.
+The function returns an error code, with 0 meaning OK, and any other value
+meaning there is an error.
+Note that "len" is type "int", which means it is limited to 2^31-1.
+If your data is larger, it is recommended to chunk your data into blocks of,
+for example, 2^30 bytes (1 GB) to avoid any "int" overflow issue.
+
+Finally, you can end the calculation at any time by using XXH32_digest().
+This function returns the final 32-bit hash.
+You must provide the same "void* state" parameter that was created by
+XXH32_init(). Memory will be freed by XXH32_digest().
+*/
+
+
+int           XXH32_sizeofState();
+XXH_errorcode XXH32_resetState(void* state, unsigned int seed);
+
+#define       XXH32_SIZEOFSTATE 48
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
+/*
+These functions allow a user application to make its own allocation for the
+state.
+
+XXH32_sizeofState() is used to know how much space must be allocated for the
+xxHash 32-bit state. Note that the state must be aligned to access 'long long'
+fields. Memory must be allocated and referenced by a pointer.
+This pointer must then be provided as 'state' into XXH32_resetState(), which
+initializes the state.
+
+For static allocation purposes (such as allocation on stack, or freestanding
+systems without malloc()), use the structure XXH32_stateSpace_t, which will
+ensure that the memory space is large enough and correctly aligned to access
+'long long' fields.
+*/
+
+
+unsigned int XXH32_intermediateDigest (void* state);
+/*
+This function does the same as XXH32_digest(), generating a 32-bit hash,
+but preserves the memory context.
+This way, it becomes possible to generate intermediate hashes, and then
+continue feeding data with XXH32_update().
+To free the memory context, use XXH32_digest(), or free().
+*/
+
+
+
+//****************************
+// Deprecated function names
+//****************************
+// The following translations are provided to ease code transition.
+// You are encouraged to no longer use these function names.
+#define XXH32_feed   XXH32_update
+#define XXH32_result XXH32_digest
+#define XXH32_getIntermediateResult XXH32_intermediateDigest
+
+/*-**********************************************************************
+ * 64-bit hash
+ ************************************************************************/
+typedef unsigned long long XXH64_hash_t;
+
+/*! XXH64() :
+    Calculate the 64-bit hash of the sequence of "len" bytes stored at memory
+    address "input". "seed" can be used to alter the result predictably.
This + function runs faster on 64-bit systems, but slower on 32-bit systems (see + benchmark). +*/ +XXH64_hash_t XXH64(const void* input, size_t length, unsigned long long seed); + +/*====== Streaming ======*/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH64_state_t* XXH64_createState(void); +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed); +XXH_errorcode XXH64_update(XXH64_state_t* statePtr, const void* input, + size_t length); +XXH64_hash_t XXH64_digest(const XXH64_state_t* statePtr); + +/*====== Canonical representation ======*/ +typedef struct { + unsigned char digest[8]; +} XXH64_canonical_t; +void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +/* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. */ + +#if !defined(__VMS) && \ + (defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) + +struct XXH64_state_s { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; + uint32_t reserved[2]; /* never read nor write, might be removed in a future + version */ +}; /* typedef'd to XXH64_state_t */ + +#else + +#ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ +struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; + unsigned memsize; + unsigned reserved[2]; /* never read nor write, might be removed in a future + version */ +}; /* typedef'd to XXH64_state_t */ +#endif + +#endif + +#if defined (__cplusplus) +} // namespace rocksdb +#endif |
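The hashing entry points declared in this header compose in two ways: one-shot hashing via XXH32()/XXH64(), and streaming hashing via the create/reset/update/digest sequence. Below is a minimal sketch of both paths as this header declares them, assuming it is compiled as C++ against this tree; the helper name, buffers, and seed are illustrative, not part of the API. Note that, unlike XXH32_digest(), XXH64_digest() does not free the state.

    #include "util/xxhash.h"
    #include <cassert>
    #include <string>

    unsigned long long HashWholeAndStreamed(const std::string& part1,
                                            const std::string& part2) {
      // One-shot: hash a single contiguous buffer with seed 0.
      const std::string whole = part1 + part2;
      unsigned long long one_shot =
          rocksdb::XXH64(whole.data(), whole.size(), 0);

      // Streaming: feed the same bytes in two packets.
      rocksdb::XXH64_state_t* state = rocksdb::XXH64_createState();
      rocksdb::XXH64_reset(state, 0);
      rocksdb::XXH64_update(state, part1.data(), part1.size());
      rocksdb::XXH64_update(state, part2.data(), part2.size());
      unsigned long long streamed = rocksdb::XXH64_digest(state);
      rocksdb::XXH64_freeState(state);  // digest did not free the state

      // Both paths hashed the same byte sequence, so the results agree.
      assert(one_shot == streamed);
      return streamed;
    }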