diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/memory | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/memory')
-rw-r--r-- | src/rocksdb/memory/allocator.h | 58 | ||||
-rw-r--r-- | src/rocksdb/memory/arena.cc | 234 | ||||
-rw-r--r-- | src/rocksdb/memory/arena.h | 141 | ||||
-rw-r--r-- | src/rocksdb/memory/arena_test.cc | 205 | ||||
-rw-r--r-- | src/rocksdb/memory/concurrent_arena.cc | 45 | ||||
-rw-r--r-- | src/rocksdb/memory/concurrent_arena.h | 215 | ||||
-rw-r--r-- | src/rocksdb/memory/jemalloc_nodump_allocator.cc | 269 | ||||
-rw-r--r-- | src/rocksdb/memory/jemalloc_nodump_allocator.h | 94 | ||||
-rw-r--r-- | src/rocksdb/memory/memkind_kmem_allocator.cc | 44 | ||||
-rw-r--r-- | src/rocksdb/memory/memkind_kmem_allocator.h | 43 | ||||
-rw-r--r-- | src/rocksdb/memory/memory_allocator.cc | 91 | ||||
-rw-r--r-- | src/rocksdb/memory/memory_allocator.h | 38 | ||||
-rw-r--r-- | src/rocksdb/memory/memory_allocator_test.cc | 240 | ||||
-rw-r--r-- | src/rocksdb/memory/memory_usage.h | 38 |
14 files changed, 1755 insertions, 0 deletions
diff --git a/src/rocksdb/memory/allocator.h b/src/rocksdb/memory/allocator.h new file mode 100644 index 000000000..0d7cd60a9 --- /dev/null +++ b/src/rocksdb/memory/allocator.h @@ -0,0 +1,58 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Abstract interface for allocating memory in blocks. This memory is freed +// when the allocator object is destroyed. See the Arena class for more info. + +#pragma once +#include <cerrno> +#include <cstddef> + +#include "rocksdb/write_buffer_manager.h" + +namespace ROCKSDB_NAMESPACE { + +class Logger; + +class Allocator { + public: + virtual ~Allocator() {} + + virtual char* Allocate(size_t bytes) = 0; + virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) = 0; + + virtual size_t BlockSize() const = 0; +}; + +class AllocTracker { + public: + explicit AllocTracker(WriteBufferManager* write_buffer_manager); + // No copying allowed + AllocTracker(const AllocTracker&) = delete; + void operator=(const AllocTracker&) = delete; + + ~AllocTracker(); + void Allocate(size_t bytes); + // Call when we're finished allocating memory so we can free it from + // the write buffer's limit. + void DoneAllocating(); + + void FreeMem(); + + bool is_freed() const { return write_buffer_manager_ == nullptr || freed_; } + + private: + WriteBufferManager* write_buffer_manager_; + std::atomic<size_t> bytes_allocated_; + bool done_allocating_; + bool freed_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/arena.cc b/src/rocksdb/memory/arena.cc new file mode 100644 index 000000000..10b8969b4 --- /dev/null +++ b/src/rocksdb/memory/arena.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "memory/arena.h" +#ifndef OS_WIN +#include <sys/mman.h> +#endif +#include <algorithm> + +#include "logging/logging.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// MSVC complains that it is already defined since it is static in the header. +#ifndef _MSC_VER +const size_t Arena::kInlineSize; +#endif + +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2u << 30; +static const int kAlignUnit = alignof(max_align_t); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; + } + + return block_size; +} + +Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size) + : kBlockSize(OptimizeBlockSize(block_size)), tracker_(tracker) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); + TEST_SYNC_POINT_CALLBACK("Arena::Arena:0", const_cast<size_t*>(&kBlockSize)); + alloc_bytes_remaining_ = sizeof(inline_block_); + blocks_memory_ += alloc_bytes_remaining_; + aligned_alloc_ptr_ = inline_block_; + unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; +#ifdef MAP_HUGETLB + hugetlb_size_ = huge_page_size; + if (hugetlb_size_ && kBlockSize > hugetlb_size_) { + hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; + } +#else + (void)huge_page_size; +#endif + if (tracker_ != nullptr) { + tracker_->Allocate(kInlineSize); + } +} + +Arena::~Arena() { + if (tracker_ != nullptr) { + assert(tracker_->is_freed()); + tracker_->FreeMem(); + } + for (const auto& block : blocks_) { + delete[] block; + } + +#ifdef MAP_HUGETLB + for (const auto& mmap_info : huge_blocks_) { + if (mmap_info.addr_ == nullptr) { + continue; + } + auto ret = munmap(mmap_info.addr_, mmap_info.length_); + if (ret != 0) { + // TODO(sdong): Better handling + } + } +#endif +} + +char* Arena::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { + ++irregular_block_num; + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + return AllocateNewBlock(bytes); + } + + // We waste the remaining space in the current block. + size_t size = 0; + char* block_head = nullptr; +#ifdef MAP_HUGETLB + if (hugetlb_size_) { + size = hugetlb_size_; + block_head = AllocateFromHugePage(size); + } +#endif + if (!block_head) { + size = kBlockSize; + block_head = AllocateNewBlock(size); + } + alloc_bytes_remaining_ = size - bytes; + + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + size; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + size - bytes; + return unaligned_alloc_ptr_; + } +} + +char* Arena::AllocateFromHugePage(size_t bytes) { +#ifdef MAP_HUGETLB + if (hugetlb_size_ == 0) { + return nullptr; + } + // Reserve space in `huge_blocks_` before calling `mmap`. + // Use `emplace_back()` instead of `reserve()` to let std::vector manage its + // own memory and do fewer reallocations. + // + // - If `emplace_back` throws, no memory leaks because we haven't called + // `mmap` yet. + // - If `mmap` throws, no memory leaks because the vector will be cleaned up + // via RAII. + huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */); + + void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE), + (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0); + + if (addr == MAP_FAILED) { + return nullptr; + } + huge_blocks_.back() = MmapInfo(addr, bytes); + blocks_memory_ += bytes; + if (tracker_ != nullptr) { + tracker_->Allocate(bytes); + } + return reinterpret_cast<char*>(addr); +#else + (void)bytes; + return nullptr; +#endif +} + +char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, + Logger* logger) { + assert((kAlignUnit & (kAlignUnit - 1)) == + 0); // Pointer size should be a power of 2 + +#ifdef MAP_HUGETLB + if (huge_page_size > 0 && bytes > 0) { + // Allocate from a huge page TLB table. + size_t reserved_size = + ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; + assert(reserved_size >= bytes); + + char* addr = AllocateFromHugePage(reserved_size); + if (addr == nullptr) { + ROCKS_LOG_WARN(logger, + "AllocateAligned fail to allocate huge TLB pages: %s", + errnoStr(errno).c_str()); + // fail back to malloc + } else { + return addr; + } + } +#else + (void)huge_page_size; + (void)logger; +#endif + + size_t current_mod = + reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1); + size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod); + size_t needed = bytes + slop; + char* result; + if (needed <= alloc_bytes_remaining_) { + result = aligned_alloc_ptr_ + slop; + aligned_alloc_ptr_ += needed; + alloc_bytes_remaining_ -= needed; + } else { + // AllocateFallback always returns aligned memory + result = AllocateFallback(bytes, true /* aligned */); + } + assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0); + return result; +} + +char* Arena::AllocateNewBlock(size_t block_bytes) { + // Reserve space in `blocks_` before allocating memory via new. + // Use `emplace_back()` instead of `reserve()` to let std::vector manage its + // own memory and do fewer reallocations. + // + // - If `emplace_back` throws, no memory leaks because we haven't called `new` + // yet. + // - If `new` throws, no memory leaks because the vector will be cleaned up + // via RAII. + blocks_.emplace_back(nullptr); + + char* block = new char[block_bytes]; + size_t allocated_size; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + allocated_size = malloc_usable_size(block); +#ifndef NDEBUG + // It's hard to predict what malloc_usable_size() returns. + // A callback can allow users to change the costed size. + std::pair<size_t*, size_t*> pair(&allocated_size, &block_bytes); + TEST_SYNC_POINT_CALLBACK("Arena::AllocateNewBlock:0", &pair); +#endif // NDEBUG +#else + allocated_size = block_bytes; +#endif // ROCKSDB_MALLOC_USABLE_SIZE + blocks_memory_ += allocated_size; + if (tracker_ != nullptr) { + tracker_->Allocate(allocated_size); + } + blocks_.back() = block; + return block; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/arena.h b/src/rocksdb/memory/arena.h new file mode 100644 index 000000000..1de04c477 --- /dev/null +++ b/src/rocksdb/memory/arena.h @@ -0,0 +1,141 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Arena is an implementation of Allocator class. For a request of small size, +// it allocates a block with pre-defined block size. For a request of big +// size, it uses malloc to directly get the requested size. + +#pragma once +#ifndef OS_WIN +#include <sys/mman.h> +#endif +#include <assert.h> +#include <stdint.h> +#include <cerrno> +#include <cstddef> +#include <vector> +#include "memory/allocator.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena : public Allocator { + public: + // No copying allowed + Arena(const Arena&) = delete; + void operator=(const Arena&) = delete; + + static const size_t kInlineSize = 2048; + static const size_t kMinBlockSize; + static const size_t kMaxBlockSize; + + // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the + // supported hugepage size of the system), block allocation will try huge + // page TLB first. If allocation fails, will fall back to normal case. + explicit Arena(size_t block_size = kMinBlockSize, + AllocTracker* tracker = nullptr, size_t huge_page_size = 0); + ~Arena(); + + char* Allocate(size_t bytes) override; + + // huge_page_size: if >0, will try to allocate from huage page TLB. + // The argument will be the size of the page size for huge page TLB. Bytes + // will be rounded up to multiple of the page size to allocate through mmap + // anonymous option with huge page on. The extra space allocated will be + // wasted. If allocation fails, will fall back to normal case. To enable it, + // need to reserve huge pages for it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt for details. + // huge page allocation can fail. In this case it will fail back to + // normal cases. The messages will be logged to logger. So when calling with + // huge_page_tlb_size > 0, we highly recommend a logger is passed in. + // Otherwise, the error message will be printed out to stderr directly. + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override; + + // Returns an estimate of the total memory usage of data allocated + // by the arena (exclude the space allocated but not yet used for future + // allocations). + size_t ApproximateMemoryUsage() const { + return blocks_memory_ + blocks_.capacity() * sizeof(char*) - + alloc_bytes_remaining_; + } + + size_t MemoryAllocatedBytes() const { return blocks_memory_; } + + size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; } + + // If an allocation is too big, we'll allocate an irregular block with the + // same size of that allocation. + size_t IrregularBlockNum() const { return irregular_block_num; } + + size_t BlockSize() const override { return kBlockSize; } + + bool IsInInlineBlock() const { + return blocks_.empty() && huge_blocks_.empty(); + } + + private: + char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t)))); + // Number of bytes allocated in one block + const size_t kBlockSize; + // Array of new[] allocated memory blocks + using Blocks = std::vector<char*>; + Blocks blocks_; + + struct MmapInfo { + void* addr_; + size_t length_; + + MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {} + }; + std::vector<MmapInfo> huge_blocks_; + size_t irregular_block_num = 0; + + // Stats for current active block. + // For each block, we allocate aligned memory chucks from one end and + // allocate unaligned memory chucks from the other end. Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? + size_t alloc_bytes_remaining_ = 0; + +#ifdef MAP_HUGETLB + size_t hugetlb_size_ = 0; +#endif // MAP_HUGETLB + char* AllocateFromHugePage(size_t bytes); + char* AllocateFallback(size_t bytes, bool aligned); + char* AllocateNewBlock(size_t block_bytes); + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_ = 0; + AllocTracker* tracker_; +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + unaligned_alloc_ptr_ -= bytes; + alloc_bytes_remaining_ -= bytes; + return unaligned_alloc_ptr_; + } + return AllocateFallback(bytes, false /* unaligned */); +} + +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. +extern size_t OptimizeBlockSize(size_t block_size); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/arena_test.cc b/src/rocksdb/memory/arena_test.cc new file mode 100644 index 000000000..96e69a932 --- /dev/null +++ b/src/rocksdb/memory/arena_test.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "memory/arena.h" +#include "test_util/testharness.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +const size_t kHugePageSize = 2 * 1024 * 1024; +} // namespace +class ArenaTest : public testing::Test {}; + +TEST_F(ArenaTest, Empty) { Arena arena0; } + +namespace { +bool CheckMemoryAllocated(size_t allocated, size_t expected) { + // The value returned by Arena::MemoryAllocatedBytes() may be greater than + // the requested memory. We choose a somewhat arbitrary upper bound of + // max_expected = expected * 1.1 to detect critical overallocation. + size_t max_expected = expected + expected / 10; + return allocated >= expected && allocated <= max_expected; +} + +void MemoryAllocatedBytesTest(size_t huge_page_size) { + const int N = 17; + size_t req_sz; // requested size + size_t bsz = 32 * 1024; // block size + size_t expected_memory_allocated; + + Arena arena(bsz, nullptr, huge_page_size); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 12 * 1024; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated = req_sz * N + Arena::kInlineSize; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); + + arena.Allocate(Arena::kInlineSize - 1); + + // requested size < quarter of a block: + // allocate a block with the default size, then try to use unused part + // of the block. So one new block will be allocated for the first + // Allocate(99) call. All the remaining calls won't lead to new allocation. + req_sz = 99; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + if (huge_page_size) { + ASSERT_TRUE( + CheckMemoryAllocated(arena.MemoryAllocatedBytes(), + expected_memory_allocated + bsz) || + CheckMemoryAllocated(arena.MemoryAllocatedBytes(), + expected_memory_allocated + huge_page_size)); + } else { + expected_memory_allocated += bsz; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); + } + + // requested size > size of a block: + // allocate requested size separately + expected_memory_allocated = arena.MemoryAllocatedBytes(); + req_sz = 8 * 1024 * 1024; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated += req_sz * N; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); +} + +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +static void ApproximateMemoryUsageTest(size_t huge_page_size) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + Arena arena(kBlockSize, nullptr, huge_page_size); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + // allocate inline bytes + const size_t kAlignUnit = alignof(max_align_t); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(kAlignUnit); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(Arena::kInlineSize / 2 - (2 * kAlignUnit)); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(Arena::kInlineSize / 2); + EXPECT_TRUE(arena.IsInInlineBlock()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - kAlignUnit); + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + Arena::kInlineSize); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); + auto mem_usage = arena.MemoryAllocatedBytes(); + if (huge_page_size) { + ASSERT_TRUE( + CheckMemoryAllocated(mem_usage, kBlockSize + Arena::kInlineSize) || + CheckMemoryAllocated(mem_usage, huge_page_size + Arena::kInlineSize)); + } else { + ASSERT_PRED2(CheckMemoryAllocated, mem_usage, + kBlockSize + Arena::kInlineSize); + } + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); + usage = arena.ApproximateMemoryUsage(); + } + if (huge_page_size) { + ASSERT_TRUE(usage > mem_usage || + usage + huge_page_size - kBlockSize == mem_usage); + } else { + ASSERT_GT(usage, mem_usage); + } +} + +static void SimpleTest(size_t huge_page_size) { + std::vector<std::pair<size_t, char*>> allocated; + Arena arena(Arena::kMinBlockSize, nullptr, huge_page_size); + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. + s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (unsigned int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); + if (i > N / 10) { + ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); + } + } + for (unsigned int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (unsigned int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256)); + } + } +} +} // namespace + +TEST_F(ArenaTest, MemoryAllocatedBytes) { + MemoryAllocatedBytesTest(0); + MemoryAllocatedBytesTest(kHugePageSize); +} + +TEST_F(ArenaTest, ApproximateMemoryUsage) { + ApproximateMemoryUsageTest(0); + ApproximateMemoryUsageTest(kHugePageSize); +} + +TEST_F(ArenaTest, Simple) { + SimpleTest(0); + SimpleTest(kHugePageSize); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/memory/concurrent_arena.cc b/src/rocksdb/memory/concurrent_arena.cc new file mode 100644 index 000000000..1619bd93b --- /dev/null +++ b/src/rocksdb/memory/concurrent_arena.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "memory/concurrent_arena.h" + +#include <thread> + +#include "port/port.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +thread_local size_t ConcurrentArena::tls_cpuid = 0; + +namespace { +// If the shard block size is too large, in the worst case, every core +// allocates a block without populate it. If the shared block size is +// 1MB, 64 cores will quickly allocate 64MB, and may quickly trigger a +// flush. Cap the size instead. +const size_t kMaxShardBlockSize = size_t{128 * 1024}; +} // namespace + +ConcurrentArena::ConcurrentArena(size_t block_size, AllocTracker* tracker, + size_t huge_page_size) + : shard_block_size_(std::min(kMaxShardBlockSize, block_size / 8)), + shards_(), + arena_(block_size, tracker, huge_page_size) { + Fixup(); +} + +ConcurrentArena::Shard* ConcurrentArena::Repick() { + auto shard_and_index = shards_.AccessElementAndIndex(); + // even if we are cpu 0, use a non-zero tls_cpuid so we can tell we + // have repicked + tls_cpuid = shard_and_index.second | shards_.Size(); + return shard_and_index.first; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/concurrent_arena.h b/src/rocksdb/memory/concurrent_arena.h new file mode 100644 index 000000000..f14507d30 --- /dev/null +++ b/src/rocksdb/memory/concurrent_arena.h @@ -0,0 +1,215 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <atomic> +#include <memory> +#include <utility> + +#include "memory/allocator.h" +#include "memory/arena.h" +#include "port/lang.h" +#include "port/likely.h" +#include "util/core_local.h" +#include "util/mutexlock.h" +#include "util/thread_local.h" + +// Only generate field unused warning for padding array, or build under +// GCC 4.8.1 will fail. +#ifdef __clang__ +#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__)) +#else +#define ROCKSDB_FIELD_UNUSED +#endif // __clang__ + +namespace ROCKSDB_NAMESPACE { + +class Logger; + +// ConcurrentArena wraps an Arena. It makes it thread safe using a fast +// inlined spinlock, and adds small per-core allocation caches to avoid +// contention for small allocations. To avoid any memory waste from the +// per-core shards, they are kept small, they are lazily instantiated +// only if ConcurrentArena actually notices concurrent use, and they +// adjust their size so that there is no fragmentation waste when the +// shard blocks are allocated from the underlying main arena. +class ConcurrentArena : public Allocator { + public: + // block_size and huge_page_size are the same as for Arena (and are + // in fact just passed to the constructor of arena_. The core-local + // shards compute their shard_block_size as a fraction of block_size + // that varies according to the hardware concurrency level. + explicit ConcurrentArena(size_t block_size = Arena::kMinBlockSize, + AllocTracker* tracker = nullptr, + size_t huge_page_size = 0); + + char* Allocate(size_t bytes) override { + return AllocateImpl(bytes, false /*force_arena*/, + [this, bytes]() { return arena_.Allocate(bytes); }); + } + + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override { + size_t rounded_up = ((bytes - 1) | (sizeof(void*) - 1)) + 1; + assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) && + (rounded_up % sizeof(void*)) == 0); + + return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/, + [this, rounded_up, huge_page_size, logger]() { + return arena_.AllocateAligned(rounded_up, + huge_page_size, logger); + }); + } + + size_t ApproximateMemoryUsage() const { + std::unique_lock<SpinMutex> lock(arena_mutex_, std::defer_lock); + lock.lock(); + return arena_.ApproximateMemoryUsage() - ShardAllocatedAndUnused(); + } + + size_t MemoryAllocatedBytes() const { + return memory_allocated_bytes_.load(std::memory_order_relaxed); + } + + size_t AllocatedAndUnused() const { + return arena_allocated_and_unused_.load(std::memory_order_relaxed) + + ShardAllocatedAndUnused(); + } + + size_t IrregularBlockNum() const { + return irregular_block_num_.load(std::memory_order_relaxed); + } + + size_t BlockSize() const override { return arena_.BlockSize(); } + + private: + struct Shard { + char padding[40] ROCKSDB_FIELD_UNUSED; + mutable SpinMutex mutex; + char* free_begin_; + std::atomic<size_t> allocated_and_unused_; + + Shard() : free_begin_(nullptr), allocated_and_unused_(0) {} + }; + + static thread_local size_t tls_cpuid; + + char padding0[56] ROCKSDB_FIELD_UNUSED; + + size_t shard_block_size_; + + CoreLocalArray<Shard> shards_; + + Arena arena_; + mutable SpinMutex arena_mutex_; + std::atomic<size_t> arena_allocated_and_unused_; + std::atomic<size_t> memory_allocated_bytes_; + std::atomic<size_t> irregular_block_num_; + + char padding1[56] ROCKSDB_FIELD_UNUSED; + + Shard* Repick(); + + size_t ShardAllocatedAndUnused() const { + size_t total = 0; + for (size_t i = 0; i < shards_.Size(); ++i) { + total += shards_.AccessAtCore(i)->allocated_and_unused_.load( + std::memory_order_relaxed); + } + return total; + } + + template <typename Func> + char* AllocateImpl(size_t bytes, bool force_arena, const Func& func) { + size_t cpu; + + // Go directly to the arena if the allocation is too large, or if + // we've never needed to Repick() and the arena mutex is available + // with no waiting. This keeps the fragmentation penalty of + // concurrency zero unless it might actually confer an advantage. + std::unique_lock<SpinMutex> arena_lock(arena_mutex_, std::defer_lock); + if (bytes > shard_block_size_ / 4 || force_arena || + ((cpu = tls_cpuid) == 0 && + !shards_.AccessAtCore(0)->allocated_and_unused_.load( + std::memory_order_relaxed) && + arena_lock.try_lock())) { + if (!arena_lock.owns_lock()) { + arena_lock.lock(); + } + auto rv = func(); + Fixup(); + return rv; + } + + // pick a shard from which to allocate + Shard* s = shards_.AccessAtCore(cpu & (shards_.Size() - 1)); + if (!s->mutex.try_lock()) { + s = Repick(); + s->mutex.lock(); + } + std::unique_lock<SpinMutex> lock(s->mutex, std::adopt_lock); + + size_t avail = s->allocated_and_unused_.load(std::memory_order_relaxed); + if (avail < bytes) { + // reload + std::lock_guard<SpinMutex> reload_lock(arena_mutex_); + + // If the arena's current block is within a factor of 2 of the right + // size, we adjust our request to avoid arena waste. + auto exact = arena_allocated_and_unused_.load(std::memory_order_relaxed); + assert(exact == arena_.AllocatedAndUnused()); + + if (exact >= bytes && arena_.IsInInlineBlock()) { + // If we haven't exhausted arena's inline block yet, allocate from arena + // directly. This ensures that we'll do the first few small allocations + // without allocating any blocks. + // In particular this prevents empty memtables from using + // disproportionately large amount of memory: a memtable allocates on + // the order of 1 KB of memory when created; we wouldn't want to + // allocate a full arena block (typically a few megabytes) for that, + // especially if there are thousands of empty memtables. + auto rv = func(); + Fixup(); + return rv; + } + + avail = exact >= shard_block_size_ / 2 && exact < shard_block_size_ * 2 + ? exact + : shard_block_size_; + s->free_begin_ = arena_.AllocateAligned(avail); + Fixup(); + } + s->allocated_and_unused_.store(avail - bytes, std::memory_order_relaxed); + + char* rv; + if ((bytes % sizeof(void*)) == 0) { + // aligned allocation from the beginning + rv = s->free_begin_; + s->free_begin_ += bytes; + } else { + // unaligned from the end + rv = s->free_begin_ + avail - bytes; + } + return rv; + } + + void Fixup() { + arena_allocated_and_unused_.store(arena_.AllocatedAndUnused(), + std::memory_order_relaxed); + memory_allocated_bytes_.store(arena_.MemoryAllocatedBytes(), + std::memory_order_relaxed); + irregular_block_num_.store(arena_.IrregularBlockNum(), + std::memory_order_relaxed); + } + + ConcurrentArena(const ConcurrentArena&) = delete; + ConcurrentArena& operator=(const ConcurrentArena&) = delete; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/jemalloc_nodump_allocator.cc b/src/rocksdb/memory/jemalloc_nodump_allocator.cc new file mode 100644 index 000000000..62ee661d2 --- /dev/null +++ b/src/rocksdb/memory/jemalloc_nodump_allocator.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "memory/jemalloc_nodump_allocator.h" + +#include <string> +#include <thread> + +#include "port/likely.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr}; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +static std::unordered_map<std::string, OptionTypeInfo> jemalloc_type_info = { +#ifndef ROCKSDB_LITE + {"limit_tcache_size", + {offsetof(struct JemallocAllocatorOptions, limit_tcache_size), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_lower_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_lower_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_upper_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +bool JemallocNodumpAllocator::IsSupported(std::string* why) { +#ifndef ROCKSDB_JEMALLOC + *why = "Not compiled with ROCKSDB_JEMALLOC"; + return false; +#else + static const std::string unsupported = + "JemallocNodumpAllocator only available with jemalloc version >= 5 " + "and MADV_DONTDUMP is available."; + if (!HasJemalloc()) { + *why = unsupported; + return false; + } +#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + *why = unsupported; + return false; +#else + return true; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +#endif // ROCKSDB_MALLOC +} + +JemallocNodumpAllocator::JemallocNodumpAllocator( + JemallocAllocatorOptions& options) + : options_(options), +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache), +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + arena_index_(0) { + RegisterOptions(&options_, &jemalloc_type_info); +} + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +JemallocNodumpAllocator::~JemallocNodumpAllocator() { + // Destroy tcache before destroying arena. + autovector<void*> tcache_list; + tcache_.Scrape(&tcache_list, nullptr); + for (void* tcache_index : tcache_list) { + DestroyThreadSpecificCache(tcache_index); + } + if (arena_index_ > 0) { + // Destroy arena. Silently ignore error. + Status s = DestroyArena(arena_index_); + assert(s.ok()); + s.PermitUncheckedError(); + } +} + +size_t JemallocNodumpAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return malloc_usable_size(static_cast<void*>(p)); +} + +void* JemallocNodumpAllocator::Allocate(size_t size) { + int tcache_flag = GetThreadSpecificCache(size); + return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); +} + +void JemallocNodumpAllocator::Deallocate(void* p) { + // Obtain tcache. + size_t size = 0; + if (options_.limit_tcache_size) { + size = malloc_usable_size(p); + } + int tcache_flag = GetThreadSpecificCache(size); + // No need to pass arena index to dallocx(). Jemalloc will find arena index + // from its own metadata. + dallocx(p, tcache_flag); +} + +Status JemallocNodumpAllocator::InitializeArenas() { + // Create arena. + size_t arena_index_size = sizeof(arena_index_); + int ret = + mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to create jemalloc arena, error code: " + + std::to_string(ret)); + } + assert(arena_index_ != 0); + + // Read existing hooks. + std::string key = "arena." + std::to_string(arena_index_) + ".extent_hooks"; + extent_hooks_t* hooks; + size_t hooks_size = sizeof(hooks); + ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to read existing hooks, error code: " + + std::to_string(ret)); + } + + // Store existing alloc. + extent_alloc_t* original_alloc = hooks->alloc; + extent_alloc_t* expected = nullptr; + bool success = + JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( + expected, original_alloc); + if (!success && original_alloc != expected) { + return Status::Incomplete("Original alloc conflict."); + } + + // Set the custom hook. + arena_hooks_.reset(new extent_hooks_t(*hooks)); + arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc; + extent_hooks_t* hooks_ptr = arena_hooks_.get(); + ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); + if (ret != 0) { + return Status::Incomplete("Failed to set custom hook, error code: " + + std::to_string(ret)); + } + return Status::OK(); +} + +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +Status JemallocNodumpAllocator::PrepareOptions( + const ConfigOptions& config_options) { + std::string message; + + if (!IsSupported(&message)) { + return Status::NotSupported(message); + } else if (options_.limit_tcache_size && + options_.tcache_size_lower_bound >= + options_.tcache_size_upper_bound) { + return Status::InvalidArgument( + "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); + } else if (IsMutable()) { + Status s = MemoryAllocator::PrepareOptions(config_options); +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + if (s.ok()) { + s = InitializeArenas(); + } +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + return s; + } else { + // Already prepared + return Status::OK(); + } +} + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) { + // We always enable tcache. The only corner case is when there are a ton of + // threads accessing with low frequency, then it could consume a lot of + // memory (may reach # threads * ~1MB) without bringing too much benefit. + if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound || + size > options_.tcache_size_upper_bound)) { + return MALLOCX_TCACHE_NONE; + } + unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get()); + if (UNLIKELY(tcache_index == nullptr)) { + // Instantiate tcache. + tcache_index = new unsigned(0); + size_t tcache_index_size = sizeof(unsigned); + int ret = + mallctl("tcache.create", tcache_index, &tcache_index_size, nullptr, 0); + if (ret != 0) { + // No good way to expose the error. Silently disable tcache. + delete tcache_index; + return MALLOCX_TCACHE_NONE; + } + tcache_.Reset(static_cast<void*>(tcache_index)); + } + return MALLOCX_TCACHE(*tcache_index); +} +void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr, + size_t size, size_t alignment, bool* zero, + bool* commit, unsigned arena_ind) { + extent_alloc_t* original_alloc = + original_alloc_.load(std::memory_order_relaxed); + assert(original_alloc != nullptr); + void* result = original_alloc(extent, new_addr, size, alignment, zero, commit, + arena_ind); + if (result != nullptr) { + int ret = madvise(result, size, MADV_DONTDUMP); + if (ret != 0) { + fprintf( + stderr, + "JemallocNodumpAllocator failed to set MADV_DONTDUMP, error code: %d", + ret); + assert(false); + } + } + return result; +} + +Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) { + assert(arena_index != 0); + std::string key = "arena." + std::to_string(arena_index) + ".destroy"; + int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to destroy jemalloc arena, error code: " + + std::to_string(ret)); + } + return Status::OK(); +} + +void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) { + assert(ptr != nullptr); + unsigned* tcache_index = static_cast<unsigned*>(ptr); + size_t tcache_index_size = sizeof(unsigned); + int ret __attribute__((__unused__)) = + mallctl("tcache.destroy", nullptr, 0, tcache_index, tcache_index_size); + // Silently ignore error. + assert(ret == 0); + delete tcache_index; +} + +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr<MemoryAllocator>* memory_allocator) { + if (memory_allocator == nullptr) { + return Status::InvalidArgument("memory_allocator must be non-null."); + } +#ifndef ROCKSDB_JEMALLOC + (void)options; + return Status::NotSupported("Not compiled with JEMALLOC"); +#else + std::unique_ptr<MemoryAllocator> allocator( + new JemallocNodumpAllocator(options)); + Status s = allocator->PrepareOptions(ConfigOptions()); + if (s.ok()) { + memory_allocator->reset(allocator.release()); + } + return s; +#endif +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/jemalloc_nodump_allocator.h b/src/rocksdb/memory/jemalloc_nodump_allocator.h new file mode 100644 index 000000000..a1e1547d7 --- /dev/null +++ b/src/rocksdb/memory/jemalloc_nodump_allocator.h @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <atomic> +#include <vector> + +#include "port/jemalloc_helper.h" +#include "port/port.h" +#include "rocksdb/memory_allocator.h" +#include "util/thread_local.h" +#include "utilities/memory_allocators.h" + +#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX) + +#include <sys/mman.h> + +#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP) +#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP +#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX + +namespace ROCKSDB_NAMESPACE { +class JemallocNodumpAllocator : public BaseMemoryAllocator { + public: + explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options); +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + ~JemallocNodumpAllocator(); +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + + static const char* kClassName() { return "JemallocNodumpAllocator"; } + const char* Name() const override { return kClassName(); } + static bool IsSupported() { + std::string unused; + return IsSupported(&unused); + } + static bool IsSupported(std::string* why); + bool IsMutable() const { return arena_index_ == 0; } + + Status PrepareOptions(const ConfigOptions& config_options) override; + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + void* Allocate(size_t size) override; + void Deallocate(void* p) override; + size_t UsableSize(void* p, size_t allocation_size) const override; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + + private: +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + Status InitializeArenas(); + + friend Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr<MemoryAllocator>* memory_allocator); + + // Custom alloc hook to replace jemalloc default alloc. + static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size, + size_t alignment, bool* zero, bool* commit, + unsigned arena_ind); + + // Destroy arena on destruction of the allocator, or on failure. + static Status DestroyArena(unsigned arena_index); + + // Destroy tcache on destruction of the allocator, or thread exit. + static void DestroyThreadSpecificCache(void* ptr); + + // Get or create tcache. Return flag suitable to use with `mallocx`: + // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc). + int GetThreadSpecificCache(size_t size); +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + JemallocAllocatorOptions options_; + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + // A function pointer to jemalloc default alloc. Use atomic to make sure + // NewJemallocNodumpAllocator is thread-safe. + // + // Hack: original_alloc_ needs to be static for Alloc() to access it. + // alloc needs to be static to pass to jemalloc as function pointer. + static std::atomic<extent_alloc_t*> original_alloc_; + + // Custom hooks has to outlive corresponding arena. + std::unique_ptr<extent_hooks_t> arena_hooks_; + + // Hold thread-local tcache index. + ThreadLocalPtr tcache_; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + + // Arena index. + unsigned arena_index_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/memkind_kmem_allocator.cc b/src/rocksdb/memory/memkind_kmem_allocator.cc new file mode 100644 index 000000000..635c2210e --- /dev/null +++ b/src/rocksdb/memory/memkind_kmem_allocator.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef MEMKIND +#include <memkind.h> +#endif // MEMKIND + +#include "memory/memkind_kmem_allocator.h" + +namespace ROCKSDB_NAMESPACE { +Status MemkindKmemAllocator::PrepareOptions(const ConfigOptions& options) { + std::string message; + if (!IsSupported(&message)) { + return Status::NotSupported(message); + } else { + return MemoryAllocator::PrepareOptions(options); + } +} + +#ifdef MEMKIND +void* MemkindKmemAllocator::Allocate(size_t size) { + void* p = memkind_malloc(MEMKIND_DAX_KMEM, size); + if (p == NULL) { + throw std::bad_alloc(); + } + return p; +} + +void MemkindKmemAllocator::Deallocate(void* p) { + memkind_free(MEMKIND_DAX_KMEM, p); +} + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +size_t MemkindKmemAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return memkind_malloc_usable_size(MEMKIND_DAX_KMEM, p); +} +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#endif // MEMKIND + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/memkind_kmem_allocator.h b/src/rocksdb/memory/memkind_kmem_allocator.h new file mode 100644 index 000000000..7176f17e3 --- /dev/null +++ b/src/rocksdb/memory/memkind_kmem_allocator.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/memory_allocator.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { + +class MemkindKmemAllocator : public BaseMemoryAllocator { + public: + static const char* kClassName() { return "MemkindKmemAllocator"; } + const char* Name() const override { return kClassName(); } + static bool IsSupported() { + std::string unused; + return IsSupported(&unused); + } + + static bool IsSupported(std::string* msg) { +#ifdef MEMKIND + (void)msg; + return true; +#else + *msg = "Not compiled with MemKind"; + return false; +#endif + } + Status PrepareOptions(const ConfigOptions& options) override; + +#ifdef MEMKIND + void* Allocate(size_t size) override; + void Deallocate(void* p) override; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + size_t UsableSize(void* p, size_t /*allocation_size*/) const override; +#endif +#endif // MEMKIND +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/memory_allocator.cc b/src/rocksdb/memory/memory_allocator.cc new file mode 100644 index 000000000..34dce9bb6 --- /dev/null +++ b/src/rocksdb/memory/memory_allocator.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/memory_allocator.h" + +#include "memory/jemalloc_nodump_allocator.h" +#include "memory/memkind_kmem_allocator.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map<std::string, OptionTypeInfo> ma_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", OptionTypeInfo::AsCustomSharedPtr<MemoryAllocator>( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinAllocators(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory<MemoryAllocator>( + DefaultMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard, + std::string* /*errmsg*/) { + guard->reset(new DefaultMemoryAllocator()); + return guard->get(); + }); + library.AddFactory<MemoryAllocator>( + CountedMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard, + std::string* /*errmsg*/) { + guard->reset(new CountedMemoryAllocator( + std::make_shared<DefaultMemoryAllocator>())); + return guard->get(); + }); + library.AddFactory<MemoryAllocator>( + JemallocNodumpAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard, + std::string* errmsg) { + if (JemallocNodumpAllocator::IsSupported(errmsg)) { + JemallocAllocatorOptions options; + guard->reset(new JemallocNodumpAllocator(options)); + } + return guard->get(); + }); + library.AddFactory<MemoryAllocator>( + MemkindKmemAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard, + std::string* errmsg) { + if (MemkindKmemAllocator::IsSupported(errmsg)) { + guard->reset(new MemkindKmemAllocator()); + } + return guard->get(); + }); + size_t num_types; + return static_cast<int>(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE +} // namespace + +MemoryAllocatorWrapper::MemoryAllocatorWrapper( + const std::shared_ptr<MemoryAllocator>& t) + : target_(t) { + RegisterOptions("", &target_, &ma_wrapper_type_info); +} + +Status MemoryAllocator::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr<MemoryAllocator>* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinAllocators(*(ObjectLibrary::Default().get()), ""); + }); +#else + if (value == DefaultMemoryAllocator::kClassName()) { + result->reset(new DefaultMemoryAllocator()); + return Status::OK(); + } +#endif // ROCKSDB_LITE + ConfigOptions copy = options; + copy.invoke_prepare_options = true; + return LoadManagedObject<MemoryAllocator>(copy, value, result); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/memory_allocator.h b/src/rocksdb/memory/memory_allocator.h new file mode 100644 index 000000000..f1a548659 --- /dev/null +++ b/src/rocksdb/memory/memory_allocator.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include "rocksdb/memory_allocator.h" + +namespace ROCKSDB_NAMESPACE { + +struct CustomDeleter { + CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {} + + void operator()(char* ptr) const { + if (allocator) { + allocator->Deallocate(reinterpret_cast<void*>(ptr)); + } else { + delete[] ptr; + } + } + + MemoryAllocator* allocator; +}; + +using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>; + +inline CacheAllocationPtr AllocateBlock(size_t size, + MemoryAllocator* allocator) { + if (allocator) { + auto block = reinterpret_cast<char*>(allocator->Allocate(size)); + return CacheAllocationPtr(block, allocator); + } + return CacheAllocationPtr(new char[size]); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/memory/memory_allocator_test.cc b/src/rocksdb/memory/memory_allocator_test.cc new file mode 100644 index 000000000..6afde7165 --- /dev/null +++ b/src/rocksdb/memory/memory_allocator_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <cstdio> + +#include "memory/jemalloc_nodump_allocator.h" +#include "memory/memkind_kmem_allocator.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "table/block_based/block_based_table_factory.h" +#include "test_util/testharness.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO: the tests do not work in LITE mode due to relying on +// `CreateFromString()` to create non-default memory allocators. +#ifndef ROCKSDB_LITE + +class MemoryAllocatorTest + : public testing::Test, + public ::testing::WithParamInterface<std::tuple<std::string, bool>> { + public: + MemoryAllocatorTest() { + std::tie(id_, supported_) = GetParam(); + Status s = + MemoryAllocator::CreateFromString(ConfigOptions(), id_, &allocator_); + EXPECT_EQ(supported_, s.ok()); + } + bool IsSupported() { return supported_; } + + std::shared_ptr<MemoryAllocator> allocator_; + std::string id_; + + private: + bool supported_; +}; + +TEST_P(MemoryAllocatorTest, Allocate) { + if (!IsSupported()) { + return; + } + void* p = allocator_->Allocate(1024); + ASSERT_NE(p, nullptr); + size_t size = allocator_->UsableSize(p, 1024); + ASSERT_GE(size, 1024); + allocator_->Deallocate(p); +} + +TEST_P(MemoryAllocatorTest, CreateAllocator) { + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.ignore_unsupported_options = false; + std::shared_ptr<MemoryAllocator> orig, copy; + Status s = MemoryAllocator::CreateFromString(config_options, id_, &orig); + if (!IsSupported()) { + ASSERT_TRUE(s.IsNotSupported()); + } else { + ASSERT_OK(s); + ASSERT_NE(orig, nullptr); +#ifndef ROCKSDB_LITE + std::string str = orig->ToString(config_options); + ASSERT_OK(MemoryAllocator::CreateFromString(config_options, str, ©)); + ASSERT_EQ(orig, copy); +#endif // ROCKSDB_LITE + } +} + +TEST_P(MemoryAllocatorTest, DatabaseBlockCache) { + if (!IsSupported()) { + // Check if a memory node is available for allocation + } + + // Create database with block cache using the MemoryAllocator + Options options; + std::string dbname = test::PerThreadDBPath("allocator_test"); + ASSERT_OK(DestroyDB(dbname, options)); + + options.create_if_missing = true; + BlockBasedTableOptions table_options; + auto cache = NewLRUCache(1024 * 1024, 6, false, 0.0, allocator_); + table_options.block_cache = cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DB* db = nullptr; + Status s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + ASSERT_LE(cache->GetUsage(), 104); // Cache will contain stats + + // Write 2kB (200 values, each 10 bytes) + int num_keys = 200; + WriteOptions wo; + std::string val = "0123456789"; + for (int i = 0; i < num_keys; i++) { + std::string key = std::to_string(i); + s = db->Put(wo, Slice(key), Slice(val)); + ASSERT_OK(s); + } + ASSERT_OK(db->Flush(FlushOptions())); // Flush all data from memtable so that + // reads are from block cache + + // Read and check block cache usage + ReadOptions ro; + std::string result; + for (int i = 0; i < num_keys; i++) { + std::string key = std::to_string(i); + s = db->Get(ro, key, &result); + ASSERT_OK(s); + ASSERT_EQ(result, val); + } + ASSERT_GT(cache->GetUsage(), 2000); + + // Close database + s = db->Close(); + ASSERT_OK(s); + delete db; + ASSERT_OK(DestroyDB(dbname, options)); +} + +class CreateMemoryAllocatorTest : public testing::Test { + public: + CreateMemoryAllocatorTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = false; + } + ConfigOptions config_options_; +}; + +TEST_F(CreateMemoryAllocatorTest, JemallocOptionsTest) { + std::shared_ptr<MemoryAllocator> allocator; + std::string id = std::string("id=") + JemallocNodumpAllocator::kClassName(); + Status s = MemoryAllocator::CreateFromString(config_options_, id, &allocator); + if (!JemallocNodumpAllocator::IsSupported()) { + ASSERT_NOK(s); + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + return; + } + ASSERT_OK(s); + ASSERT_NE(allocator, nullptr); + JemallocAllocatorOptions jopts; + auto opts = allocator->GetOptions<JemallocAllocatorOptions>(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + + ASSERT_NOK(MemoryAllocator::CreateFromString( + config_options_, + id + "; limit_tcache_size=true; tcache_size_lower_bound=4096; " + "tcache_size_upper_bound=1024", + &allocator)); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, + id + "; limit_tcache_size=false; tcache_size_lower_bound=4096; " + "tcache_size_upper_bound=1024", + &allocator)); + opts = allocator->GetOptions<JemallocAllocatorOptions>(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, false); + ASSERT_EQ(opts->tcache_size_lower_bound, 4096U); + ASSERT_EQ(opts->tcache_size_upper_bound, 1024U); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, + id + "; limit_tcache_size=true; tcache_size_upper_bound=4096; " + "tcache_size_lower_bound=1024", + &allocator)); + opts = allocator->GetOptions<JemallocAllocatorOptions>(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, true); + ASSERT_EQ(opts->tcache_size_lower_bound, 1024U); + ASSERT_EQ(opts->tcache_size_upper_bound, 4096U); +} + +TEST_F(CreateMemoryAllocatorTest, NewJemallocNodumpAllocator) { + JemallocAllocatorOptions jopts; + std::shared_ptr<MemoryAllocator> allocator; + + jopts.limit_tcache_size = true; + jopts.tcache_size_lower_bound = 2 * 1024; + jopts.tcache_size_upper_bound = 1024; + + ASSERT_NOK(NewJemallocNodumpAllocator(jopts, nullptr)); + Status s = NewJemallocNodumpAllocator(jopts, &allocator); + std::string msg; + if (!JemallocNodumpAllocator::IsSupported(&msg)) { + ASSERT_NOK(s); + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + return; + } + ASSERT_NOK(s); // Invalid options + ASSERT_EQ(allocator, nullptr); + + jopts.tcache_size_upper_bound = 4 * 1024; + ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator)); + ASSERT_NE(allocator, nullptr); + auto opts = allocator->GetOptions<JemallocAllocatorOptions>(); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); + + jopts.limit_tcache_size = false; + ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator)); + ASSERT_NE(allocator, nullptr); + opts = allocator->GetOptions<JemallocAllocatorOptions>(); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); +} + +INSTANTIATE_TEST_CASE_P(DefaultMemoryAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple( + DefaultMemoryAllocator::kClassName(), true))); +#ifdef MEMKIND +INSTANTIATE_TEST_CASE_P( + MemkindkMemAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple(MemkindKmemAllocator::kClassName(), + MemkindKmemAllocator::IsSupported()))); +#endif // MEMKIND + +#ifdef ROCKSDB_JEMALLOC +INSTANTIATE_TEST_CASE_P( + JemallocNodumpAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple(JemallocNodumpAllocator::kClassName(), + JemallocNodumpAllocator::IsSupported()))); +#endif // ROCKSDB_JEMALLOC + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/memory/memory_usage.h b/src/rocksdb/memory/memory_usage.h new file mode 100644 index 000000000..76b9bd130 --- /dev/null +++ b/src/rocksdb/memory/memory_usage.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstddef> +#include <unordered_map> +#ifdef USE_FOLLY +#include <folly/container/F14Map.h> +#endif + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Helper methods to estimate memroy usage by std containers. + +template <class Key, class Value, class Hash> +size_t ApproximateMemoryUsage( + const std::unordered_map<Key, Value, Hash>& umap) { + using Map = std::unordered_map<Key, Value, Hash>; + return sizeof(umap) + + // Size of all items plus a next pointer for each item. + (sizeof(typename Map::value_type) + sizeof(void*)) * umap.size() + + // Size of hash buckets. + umap.bucket_count() * sizeof(void*); +} + +#ifdef USE_FOLLY +template <class Key, class Value, class Hash> +size_t ApproximateMemoryUsage(const folly::F14FastMap<Key, Value, Hash>& umap) { + return sizeof(umap) + umap.getAllocatedMemorySize(); +} +#endif + +} // namespace ROCKSDB_NAMESPACE |