summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/memory
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/memory
parentInitial commit. (diff)
downloadceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/memory')
-rw-r--r--src/rocksdb/memory/allocator.h57
-rw-r--r--src/rocksdb/memory/arena.cc233
-rw-r--r--src/rocksdb/memory/arena.h141
-rw-r--r--src/rocksdb/memory/arena_test.cc204
-rw-r--r--src/rocksdb/memory/concurrent_arena.cc47
-rw-r--r--src/rocksdb/memory/concurrent_arena.h215
-rw-r--r--src/rocksdb/memory/jemalloc_nodump_allocator.cc206
-rw-r--r--src/rocksdb/memory/jemalloc_nodump_allocator.h78
-rw-r--r--src/rocksdb/memory/memory_allocator.h38
-rw-r--r--src/rocksdb/memory/memory_usage.h25
10 files changed, 1244 insertions, 0 deletions
diff --git a/src/rocksdb/memory/allocator.h b/src/rocksdb/memory/allocator.h
new file mode 100644
index 000000000..002ad5f1d
--- /dev/null
+++ b/src/rocksdb/memory/allocator.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Abstract interface for allocating memory in blocks. This memory is freed
+// when the allocator object is destroyed. See the Arena class for more info.
+
+#pragma once
+#include <cerrno>
+#include <cstddef>
+#include "rocksdb/write_buffer_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+
+class Allocator {
+ public:
+ virtual ~Allocator() {}
+
+ virtual char* Allocate(size_t bytes) = 0;
+ virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+ Logger* logger = nullptr) = 0;
+
+ virtual size_t BlockSize() const = 0;
+};
+
+class AllocTracker {
+ public:
+ explicit AllocTracker(WriteBufferManager* write_buffer_manager);
+ // No copying allowed
+ AllocTracker(const AllocTracker&) = delete;
+ void operator=(const AllocTracker&) = delete;
+
+ ~AllocTracker();
+ void Allocate(size_t bytes);
+ // Call when we're finished allocating memory so we can free it from
+ // the write buffer's limit.
+ void DoneAllocating();
+
+ void FreeMem();
+
+ bool is_freed() const { return write_buffer_manager_ == nullptr || freed_; }
+
+ private:
+ WriteBufferManager* write_buffer_manager_;
+ std::atomic<size_t> bytes_allocated_;
+ bool done_allocating_;
+ bool freed_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/arena.cc b/src/rocksdb/memory/arena.cc
new file mode 100644
index 000000000..ed46459d9
--- /dev/null
+++ b/src/rocksdb/memory/arena.cc
@@ -0,0 +1,233 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memory/arena.h"
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
+#include <algorithm>
+#include "logging/logging.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// MSVC complains that it is already defined since it is static in the header.
+#ifndef _MSC_VER
+const size_t Arena::kInlineSize;
+#endif
+
+const size_t Arena::kMinBlockSize = 4096;
+const size_t Arena::kMaxBlockSize = 2u << 30;
+static const int kAlignUnit = alignof(max_align_t);
+
+size_t OptimizeBlockSize(size_t block_size) {
+ // Make sure block_size is in optimal range
+ block_size = std::max(Arena::kMinBlockSize, block_size);
+ block_size = std::min(Arena::kMaxBlockSize, block_size);
+
+ // make sure block_size is the multiple of kAlignUnit
+ if (block_size % kAlignUnit != 0) {
+ block_size = (1 + block_size / kAlignUnit) * kAlignUnit;
+ }
+
+ return block_size;
+}
+
+Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size)
+ : kBlockSize(OptimizeBlockSize(block_size)), tracker_(tracker) {
+ assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
+ kBlockSize % kAlignUnit == 0);
+ TEST_SYNC_POINT_CALLBACK("Arena::Arena:0", const_cast<size_t*>(&kBlockSize));
+ alloc_bytes_remaining_ = sizeof(inline_block_);
+ blocks_memory_ += alloc_bytes_remaining_;
+ aligned_alloc_ptr_ = inline_block_;
+ unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_;
+#ifdef MAP_HUGETLB
+ hugetlb_size_ = huge_page_size;
+ if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
+ hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
+ }
+#else
+ (void)huge_page_size;
+#endif
+ if (tracker_ != nullptr) {
+ tracker_->Allocate(kInlineSize);
+ }
+}
+
+Arena::~Arena() {
+ if (tracker_ != nullptr) {
+ assert(tracker_->is_freed());
+ tracker_->FreeMem();
+ }
+ for (const auto& block : blocks_) {
+ delete[] block;
+ }
+
+#ifdef MAP_HUGETLB
+ for (const auto& mmap_info : huge_blocks_) {
+ if (mmap_info.addr_ == nullptr) {
+ continue;
+ }
+ auto ret = munmap(mmap_info.addr_, mmap_info.length_);
+ if (ret != 0) {
+ // TODO(sdong): Better handling
+ }
+ }
+#endif
+}
+
+char* Arena::AllocateFallback(size_t bytes, bool aligned) {
+ if (bytes > kBlockSize / 4) {
+ ++irregular_block_num;
+ // Object is more than a quarter of our block size. Allocate it separately
+ // to avoid wasting too much space in leftover bytes.
+ return AllocateNewBlock(bytes);
+ }
+
+ // We waste the remaining space in the current block.
+ size_t size = 0;
+ char* block_head = nullptr;
+#ifdef MAP_HUGETLB
+ if (hugetlb_size_) {
+ size = hugetlb_size_;
+ block_head = AllocateFromHugePage(size);
+ }
+#endif
+ if (!block_head) {
+ size = kBlockSize;
+ block_head = AllocateNewBlock(size);
+ }
+ alloc_bytes_remaining_ = size - bytes;
+
+ if (aligned) {
+ aligned_alloc_ptr_ = block_head + bytes;
+ unaligned_alloc_ptr_ = block_head + size;
+ return block_head;
+ } else {
+ aligned_alloc_ptr_ = block_head;
+ unaligned_alloc_ptr_ = block_head + size - bytes;
+ return unaligned_alloc_ptr_;
+ }
+}
+
+char* Arena::AllocateFromHugePage(size_t bytes) {
+#ifdef MAP_HUGETLB
+ if (hugetlb_size_ == 0) {
+ return nullptr;
+ }
+ // Reserve space in `huge_blocks_` before calling `mmap`.
+ // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+ // own memory and do fewer reallocations.
+ //
+ // - If `emplace_back` throws, no memory leaks because we haven't called
+ // `mmap` yet.
+ // - If `mmap` throws, no memory leaks because the vector will be cleaned up
+ // via RAII.
+ huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */);
+
+ void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
+ (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
+
+ if (addr == MAP_FAILED) {
+ return nullptr;
+ }
+ huge_blocks_.back() = MmapInfo(addr, bytes);
+ blocks_memory_ += bytes;
+ if (tracker_ != nullptr) {
+ tracker_->Allocate(bytes);
+ }
+ return reinterpret_cast<char*>(addr);
+#else
+ (void)bytes;
+ return nullptr;
+#endif
+}
+
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
+ Logger* logger) {
+ assert((kAlignUnit & (kAlignUnit - 1)) ==
+ 0); // Pointer size should be a power of 2
+
+#ifdef MAP_HUGETLB
+ if (huge_page_size > 0 && bytes > 0) {
+ // Allocate from huge page TLB.
+ assert(logger != nullptr); // logger need to be passed in.
+ size_t reserved_size =
+ ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
+ assert(reserved_size >= bytes);
+
+ char* addr = AllocateFromHugePage(reserved_size);
+ if (addr == nullptr) {
+ ROCKS_LOG_WARN(logger,
+ "AllocateAligned fail to allocate huge TLB pages: %s",
+ strerror(errno));
+ // fall back to malloc
+ } else {
+ return addr;
+ }
+ }
+#else
+ (void)huge_page_size;
+ (void)logger;
+#endif
+
+ size_t current_mod =
+ reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
+ size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
+ size_t needed = bytes + slop;
+ char* result;
+ if (needed <= alloc_bytes_remaining_) {
+ result = aligned_alloc_ptr_ + slop;
+ aligned_alloc_ptr_ += needed;
+ alloc_bytes_remaining_ -= needed;
+ } else {
+ // AllocateFallback always returns aligned memory
+ result = AllocateFallback(bytes, true /* aligned */);
+ }
+ assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
+ return result;
+}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {
+ // Reserve space in `blocks_` before allocating memory via new.
+ // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+ // own memory and do fewer reallocations.
+ //
+ // - If `emplace_back` throws, no memory leaks because we haven't called `new`
+ // yet.
+ // - If `new` throws, no memory leaks because the vector will be cleaned up
+ // via RAII.
+ blocks_.emplace_back(nullptr);
+
+ char* block = new char[block_bytes];
+ size_t allocated_size;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ allocated_size = malloc_usable_size(block);
+#ifndef NDEBUG
+ // It's hard to predict what malloc_usable_size() returns.
+ // A callback can allow users to change the costed size.
+ std::pair<size_t*, size_t*> pair(&allocated_size, &block_bytes);
+ TEST_SYNC_POINT_CALLBACK("Arena::AllocateNewBlock:0", &pair);
+#endif // NDEBUG
+#else
+ allocated_size = block_bytes;
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ blocks_memory_ += allocated_size;
+ if (tracker_ != nullptr) {
+ tracker_->Allocate(allocated_size);
+ }
+ blocks_.back() = block;
+ return block;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/arena.h b/src/rocksdb/memory/arena.h
new file mode 100644
index 000000000..a7ee4c6ab
--- /dev/null
+++ b/src/rocksdb/memory/arena.h
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Arena is an implementation of Allocator class. For a request of small size,
+// it allocates a block with pre-defined block size. For a request of big
+// size, it uses malloc to directly get the requested size.
+
+#pragma once
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
+#include <assert.h>
+#include <stdint.h>
+#include <cerrno>
+#include <cstddef>
+#include <vector>
+#include "memory/allocator.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena : public Allocator {
+ public:
+ // No copying allowed
+ Arena(const Arena&) = delete;
+ void operator=(const Arena&) = delete;
+
+ static const size_t kInlineSize = 2048;
+ static const size_t kMinBlockSize;
+ static const size_t kMaxBlockSize;
+
+ // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the
+ // supported hugepage size of the system), block allocation will try huge
+ // page TLB first. If allocation fails, will fall back to normal case.
+ explicit Arena(size_t block_size = kMinBlockSize,
+ AllocTracker* tracker = nullptr, size_t huge_page_size = 0);
+ ~Arena();
+
+ char* Allocate(size_t bytes) override;
+
+ // huge_page_size: if >0, will try to allocate from huge page TLB.
+ // The argument will be the size of the page size for huge page TLB. Bytes
+ // will be rounded up to multiple of the page size to allocate through mmap
+ // anonymous option with huge page on. The extra space allocated will be
+ // wasted. If allocation fails, will fall back to normal case. To enable it,
+ // need to reserve huge pages for it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt for details.
+ // Huge page allocation can fail. In this case it will fall back to the
+ // normal case. The message will be logged to logger. So when calling with
+ // huge_page_size > 0, we highly recommend that a logger is passed in.
+ // Otherwise, the error message will be printed out to stderr directly.
+ char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+ Logger* logger = nullptr) override;
+
+ // Returns an estimate of the total memory usage of data allocated
+ // by the arena (exclude the space allocated but not yet used for future
+ // allocations).
+ size_t ApproximateMemoryUsage() const {
+ return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+ alloc_bytes_remaining_;
+ }
+
+ size_t MemoryAllocatedBytes() const { return blocks_memory_; }
+
+ size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; }
+
+ // If an allocation is too big, we'll allocate an irregular block with the
+ // same size as that allocation.
+ size_t IrregularBlockNum() const { return irregular_block_num; }
+
+ size_t BlockSize() const override { return kBlockSize; }
+
+ bool IsInInlineBlock() const {
+ return blocks_.empty();
+ }
+
+ private:
+ char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t))));
+ // Number of bytes allocated in one block
+ const size_t kBlockSize;
+ // Array of new[] allocated memory blocks
+ typedef std::vector<char*> Blocks;
+ Blocks blocks_;
+
+ struct MmapInfo {
+ void* addr_;
+ size_t length_;
+
+ MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+ };
+ std::vector<MmapInfo> huge_blocks_;
+ size_t irregular_block_num = 0;
+
+ // Stats for current active block.
+ // For each block, we allocate aligned memory chunks from one end and
+ // allocate unaligned memory chunks from the other end. Otherwise the
+ // memory waste for alignment will be higher if we allocate both types of
+ // memory from one direction.
+ char* unaligned_alloc_ptr_ = nullptr;
+ char* aligned_alloc_ptr_ = nullptr;
+ // How many bytes left in currently active block?
+ size_t alloc_bytes_remaining_ = 0;
+
+#ifdef MAP_HUGETLB
+ size_t hugetlb_size_ = 0;
+#endif // MAP_HUGETLB
+ char* AllocateFromHugePage(size_t bytes);
+ char* AllocateFallback(size_t bytes, bool aligned);
+ char* AllocateNewBlock(size_t block_bytes);
+
+ // Bytes of memory in blocks allocated so far
+ size_t blocks_memory_ = 0;
+ AllocTracker* tracker_;
+};
+
+inline char* Arena::Allocate(size_t bytes) {
+ // The semantics of what to return are a bit messy if we allow
+ // 0-byte allocations, so we disallow them here (we don't need
+ // them for our internal use).
+ assert(bytes > 0);
+ if (bytes <= alloc_bytes_remaining_) {
+ unaligned_alloc_ptr_ -= bytes;
+ alloc_bytes_remaining_ -= bytes;
+ return unaligned_alloc_ptr_;
+ }
+ return AllocateFallback(bytes, false /* unaligned */);
+}
+
+// check and adjust the block_size so that the return value is
+// 1. in the range of [kMinBlockSize, kMaxBlockSize].
+// 2. the multiple of align unit.
+extern size_t OptimizeBlockSize(size_t block_size);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/arena_test.cc b/src/rocksdb/memory/arena_test.cc
new file mode 100644
index 000000000..3fa6483fc
--- /dev/null
+++ b/src/rocksdb/memory/arena_test.cc
@@ -0,0 +1,204 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memory/arena.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const size_t kHugePageSize = 2 * 1024 * 1024;
+} // namespace
+class ArenaTest : public testing::Test {};
+
+TEST_F(ArenaTest, Empty) { Arena arena0; }
+
+namespace {
+bool CheckMemoryAllocated(size_t allocated, size_t expected) {
+ // The value returned by Arena::MemoryAllocatedBytes() may be greater than
+ // the requested memory. We choose a somewhat arbitrary upper bound of
+ // max_expected = expected * 1.1 to detect critical overallocation.
+ size_t max_expected = expected + expected / 10;
+ return allocated >= expected && allocated <= max_expected;
+}
+
+void MemoryAllocatedBytesTest(size_t huge_page_size) {
+ const int N = 17;
+ size_t req_sz; // requested size
+ size_t bsz = 32 * 1024; // block size
+ size_t expected_memory_allocated;
+
+ Arena arena(bsz, nullptr, huge_page_size);
+
+ // requested size > quarter of a block:
+ // allocate requested size separately
+ req_sz = 12 * 1024;
+ for (int i = 0; i < N; i++) {
+ arena.Allocate(req_sz);
+ }
+ expected_memory_allocated = req_sz * N + Arena::kInlineSize;
+ ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+ expected_memory_allocated);
+
+ arena.Allocate(Arena::kInlineSize - 1);
+
+ // requested size < quarter of a block:
+ // allocate a block with the default size, then try to use unused part
+ // of the block. So one new block will be allocated for the first
+ // Allocate(99) call. All the remaining calls won't lead to new allocation.
+ req_sz = 99;
+ for (int i = 0; i < N; i++) {
+ arena.Allocate(req_sz);
+ }
+ if (huge_page_size) {
+ ASSERT_TRUE(
+ CheckMemoryAllocated(arena.MemoryAllocatedBytes(),
+ expected_memory_allocated + bsz) ||
+ CheckMemoryAllocated(arena.MemoryAllocatedBytes(),
+ expected_memory_allocated + huge_page_size));
+ } else {
+ expected_memory_allocated += bsz;
+ ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+ expected_memory_allocated);
+ }
+
+ // requested size > size of a block:
+ // allocate requested size separately
+ expected_memory_allocated = arena.MemoryAllocatedBytes();
+ req_sz = 8 * 1024 * 1024;
+ for (int i = 0; i < N; i++) {
+ arena.Allocate(req_sz);
+ }
+ expected_memory_allocated += req_sz * N;
+ ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+ expected_memory_allocated);
+}
+
+// Make sure we don't count the allocated-but-unused memory space in
+// Arena::ApproximateMemoryUsage()
+static void ApproximateMemoryUsageTest(size_t huge_page_size) {
+ const size_t kBlockSize = 4096;
+ const size_t kEntrySize = kBlockSize / 8;
+ const size_t kZero = 0;
+ Arena arena(kBlockSize, nullptr, huge_page_size);
+ ASSERT_EQ(kZero, arena.ApproximateMemoryUsage());
+
+ // allocate inline bytes
+ const size_t kAlignUnit = alignof(max_align_t);
+ EXPECT_TRUE(arena.IsInInlineBlock());
+ arena.AllocateAligned(kAlignUnit);
+ EXPECT_TRUE(arena.IsInInlineBlock());
+ arena.AllocateAligned(Arena::kInlineSize / 2 - (2 * kAlignUnit));
+ EXPECT_TRUE(arena.IsInInlineBlock());
+ arena.AllocateAligned(Arena::kInlineSize / 2);
+ EXPECT_TRUE(arena.IsInInlineBlock());
+ ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - kAlignUnit);
+ ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+ Arena::kInlineSize);
+
+ auto num_blocks = kBlockSize / kEntrySize;
+
+ // first allocation
+ arena.AllocateAligned(kEntrySize);
+ EXPECT_FALSE(arena.IsInInlineBlock());
+ auto mem_usage = arena.MemoryAllocatedBytes();
+ if (huge_page_size) {
+ ASSERT_TRUE(
+ CheckMemoryAllocated(mem_usage, kBlockSize + Arena::kInlineSize) ||
+ CheckMemoryAllocated(mem_usage, huge_page_size + Arena::kInlineSize));
+ } else {
+ ASSERT_PRED2(CheckMemoryAllocated, mem_usage,
+ kBlockSize + Arena::kInlineSize);
+ }
+ auto usage = arena.ApproximateMemoryUsage();
+ ASSERT_LT(usage, mem_usage);
+ for (size_t i = 1; i < num_blocks; ++i) {
+ arena.AllocateAligned(kEntrySize);
+ ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes());
+ ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize);
+ EXPECT_FALSE(arena.IsInInlineBlock());
+ usage = arena.ApproximateMemoryUsage();
+ }
+ if (huge_page_size) {
+ ASSERT_TRUE(usage > mem_usage ||
+ usage + huge_page_size - kBlockSize == mem_usage);
+ } else {
+ ASSERT_GT(usage, mem_usage);
+ }
+}
+
+static void SimpleTest(size_t huge_page_size) {
+ std::vector<std::pair<size_t, char*>> allocated;
+ Arena arena(Arena::kMinBlockSize, nullptr, huge_page_size);
+ const int N = 100000;
+ size_t bytes = 0;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ size_t s;
+ if (i % (N / 10) == 0) {
+ s = i;
+ } else {
+ s = rnd.OneIn(4000)
+ ? rnd.Uniform(6000)
+ : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
+ }
+ if (s == 0) {
+ // Our arena disallows size 0 allocations.
+ s = 1;
+ }
+ char* r;
+ if (rnd.OneIn(10)) {
+ r = arena.AllocateAligned(s);
+ } else {
+ r = arena.Allocate(s);
+ }
+
+ for (unsigned int b = 0; b < s; b++) {
+ // Fill the "i"th allocation with a known bit pattern
+ r[b] = i % 256;
+ }
+ bytes += s;
+ allocated.push_back(std::make_pair(s, r));
+ ASSERT_GE(arena.ApproximateMemoryUsage(), bytes);
+ if (i > N / 10) {
+ ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10);
+ }
+ }
+ for (unsigned int i = 0; i < allocated.size(); i++) {
+ size_t num_bytes = allocated[i].first;
+ const char* p = allocated[i].second;
+ for (unsigned int b = 0; b < num_bytes; b++) {
+ // Check the "i"th allocation for the known bit pattern
+ ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256));
+ }
+ }
+}
+} // namespace
+
+TEST_F(ArenaTest, MemoryAllocatedBytes) {
+ MemoryAllocatedBytesTest(0);
+ MemoryAllocatedBytesTest(kHugePageSize);
+}
+
+TEST_F(ArenaTest, ApproximateMemoryUsage) {
+ ApproximateMemoryUsageTest(0);
+ ApproximateMemoryUsageTest(kHugePageSize);
+}
+
+TEST_F(ArenaTest, Simple) {
+ SimpleTest(0);
+ SimpleTest(kHugePageSize);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/memory/concurrent_arena.cc b/src/rocksdb/memory/concurrent_arena.cc
new file mode 100644
index 000000000..3333f94db
--- /dev/null
+++ b/src/rocksdb/memory/concurrent_arena.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memory/concurrent_arena.h"
+#include <thread>
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+__thread size_t ConcurrentArena::tls_cpuid = 0;
+#endif
+
+namespace {
+// If the shard block size is too large, in the worst case, every core
+// allocates a block without populating it. If the shard block size is
+// 1MB, 64 cores will quickly allocate 64MB, and may quickly trigger a
+// flush. Cap the size instead.
+const size_t kMaxShardBlockSize = size_t{128 * 1024};
+} // namespace
+
+ConcurrentArena::ConcurrentArena(size_t block_size, AllocTracker* tracker,
+ size_t huge_page_size)
+ : shard_block_size_(std::min(kMaxShardBlockSize, block_size / 8)),
+ shards_(),
+ arena_(block_size, tracker, huge_page_size) {
+ Fixup();
+}
+
+ConcurrentArena::Shard* ConcurrentArena::Repick() {
+ auto shard_and_index = shards_.AccessElementAndIndex();
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+ // even if we are cpu 0, use a non-zero tls_cpuid so we can tell we
+ // have repicked
+ tls_cpuid = shard_and_index.second | shards_.Size();
+#endif
+ return shard_and_index.first;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/concurrent_arena.h b/src/rocksdb/memory/concurrent_arena.h
new file mode 100644
index 000000000..dad27a307
--- /dev/null
+++ b/src/rocksdb/memory/concurrent_arena.h
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+#include <utility>
+#include "memory/allocator.h"
+#include "memory/arena.h"
+#include "port/likely.h"
+#include "util/core_local.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// Only generate field unused warning for padding array, or build under
+// GCC 4.8.1 will fail.
+#ifdef __clang__
+#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__))
+#else
+#define ROCKSDB_FIELD_UNUSED
+#endif // __clang__
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+
+// ConcurrentArena wraps an Arena. It makes it thread safe using a fast
+// inlined spinlock, and adds small per-core allocation caches to avoid
+// contention for small allocations. To avoid any memory waste from the
+// per-core shards, they are kept small, they are lazily instantiated
+// only if ConcurrentArena actually notices concurrent use, and they
+// adjust their size so that there is no fragmentation waste when the
+// shard blocks are allocated from the underlying main arena.
+class ConcurrentArena : public Allocator {
+ public:
+ // block_size and huge_page_size are the same as for Arena (and are
+ // in fact just passed to the constructor of arena_). The core-local
+ // shards use a shard_block_size of block_size / 8, capped at an
+ // internal maximum (kMaxShardBlockSize in concurrent_arena.cc).
+ explicit ConcurrentArena(size_t block_size = Arena::kMinBlockSize,
+ AllocTracker* tracker = nullptr,
+ size_t huge_page_size = 0);
+
+ char* Allocate(size_t bytes) override {
+ return AllocateImpl(bytes, false /*force_arena*/,
+ [=]() { return arena_.Allocate(bytes); });
+ }
+
+ char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+ Logger* logger = nullptr) override {
+ size_t rounded_up = ((bytes - 1) | (sizeof(void*) - 1)) + 1;
+ assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) &&
+ (rounded_up % sizeof(void*)) == 0);
+
+ return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/, [=]() {
+ return arena_.AllocateAligned(rounded_up, huge_page_size, logger);
+ });
+ }
+
+ size_t ApproximateMemoryUsage() const {
+ std::unique_lock<SpinMutex> lock(arena_mutex_, std::defer_lock);
+ lock.lock();
+ return arena_.ApproximateMemoryUsage() - ShardAllocatedAndUnused();
+ }
+
+ size_t MemoryAllocatedBytes() const {
+ return memory_allocated_bytes_.load(std::memory_order_relaxed);
+ }
+
+ size_t AllocatedAndUnused() const {
+ return arena_allocated_and_unused_.load(std::memory_order_relaxed) +
+ ShardAllocatedAndUnused();
+ }
+
+ size_t IrregularBlockNum() const {
+ return irregular_block_num_.load(std::memory_order_relaxed);
+ }
+
+ size_t BlockSize() const override { return arena_.BlockSize(); }
+
+ private:
+ struct Shard {
+ char padding[40] ROCKSDB_FIELD_UNUSED;
+ mutable SpinMutex mutex;
+ char* free_begin_;
+ std::atomic<size_t> allocated_and_unused_;
+
+ Shard() : free_begin_(nullptr), allocated_and_unused_(0) {}
+ };
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+ static __thread size_t tls_cpuid;
+#else
+ enum ZeroFirstEnum : size_t { tls_cpuid = 0 };
+#endif
+
+ char padding0[56] ROCKSDB_FIELD_UNUSED;
+
+ size_t shard_block_size_;
+
+ CoreLocalArray<Shard> shards_;
+
+ Arena arena_;
+ mutable SpinMutex arena_mutex_;
+ std::atomic<size_t> arena_allocated_and_unused_;
+ std::atomic<size_t> memory_allocated_bytes_;
+ std::atomic<size_t> irregular_block_num_;
+
+ char padding1[56] ROCKSDB_FIELD_UNUSED;
+
+ Shard* Repick();
+
+ size_t ShardAllocatedAndUnused() const {
+ size_t total = 0;
+ for (size_t i = 0; i < shards_.Size(); ++i) {
+ total += shards_.AccessAtCore(i)->allocated_and_unused_.load(
+ std::memory_order_relaxed);
+ }
+ return total;
+ }
+
+ template <typename Func>
+ char* AllocateImpl(size_t bytes, bool force_arena, const Func& func) {
+ size_t cpu;
+
+ // Go directly to the arena if the allocation is too large, or if
+ // we've never needed to Repick() and the arena mutex is available
+ // with no waiting. This keeps the fragmentation penalty of
+ // concurrency zero unless it might actually confer an advantage.
+ std::unique_lock<SpinMutex> arena_lock(arena_mutex_, std::defer_lock);
+ if (bytes > shard_block_size_ / 4 || force_arena ||
+ ((cpu = tls_cpuid) == 0 &&
+ !shards_.AccessAtCore(0)->allocated_and_unused_.load(
+ std::memory_order_relaxed) &&
+ arena_lock.try_lock())) {
+ if (!arena_lock.owns_lock()) {
+ arena_lock.lock();
+ }
+ auto rv = func();
+ Fixup();
+ return rv;
+ }
+
+ // pick a shard from which to allocate
+ Shard* s = shards_.AccessAtCore(cpu & (shards_.Size() - 1));
+ if (!s->mutex.try_lock()) {
+ s = Repick();
+ s->mutex.lock();
+ }
+ std::unique_lock<SpinMutex> lock(s->mutex, std::adopt_lock);
+
+ size_t avail = s->allocated_and_unused_.load(std::memory_order_relaxed);
+ if (avail < bytes) {
+ // reload
+ std::lock_guard<SpinMutex> reload_lock(arena_mutex_);
+
+ // If the arena's current block is within a factor of 2 of the right
+ // size, we adjust our request to avoid arena waste.
+ auto exact = arena_allocated_and_unused_.load(std::memory_order_relaxed);
+ assert(exact == arena_.AllocatedAndUnused());
+
+ if (exact >= bytes && arena_.IsInInlineBlock()) {
+ // If we haven't exhausted arena's inline block yet, allocate from arena
+ // directly. This ensures that we'll do the first few small allocations
+ // without allocating any blocks.
+ // In particular this prevents empty memtables from using
+ // disproportionately large amount of memory: a memtable allocates on
+ // the order of 1 KB of memory when created; we wouldn't want to
+ // allocate a full arena block (typically a few megabytes) for that,
+ // especially if there are thousands of empty memtables.
+ auto rv = func();
+ Fixup();
+ return rv;
+ }
+
+ avail = exact >= shard_block_size_ / 2 && exact < shard_block_size_ * 2
+ ? exact
+ : shard_block_size_;
+ s->free_begin_ = arena_.AllocateAligned(avail);
+ Fixup();
+ }
+ s->allocated_and_unused_.store(avail - bytes, std::memory_order_relaxed);
+
+ char* rv;
+ if ((bytes % sizeof(void*)) == 0) {
+ // aligned allocation from the beginning
+ rv = s->free_begin_;
+ s->free_begin_ += bytes;
+ } else {
+ // unaligned from the end
+ rv = s->free_begin_ + avail - bytes;
+ }
+ return rv;
+ }
+
+ void Fixup() {
+ arena_allocated_and_unused_.store(arena_.AllocatedAndUnused(),
+ std::memory_order_relaxed);
+ memory_allocated_bytes_.store(arena_.MemoryAllocatedBytes(),
+ std::memory_order_relaxed);
+ irregular_block_num_.store(arena_.IrregularBlockNum(),
+ std::memory_order_relaxed);
+ }
+
+ ConcurrentArena(const ConcurrentArena&) = delete;
+ ConcurrentArena& operator=(const ConcurrentArena&) = delete;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/jemalloc_nodump_allocator.cc b/src/rocksdb/memory/jemalloc_nodump_allocator.cc
new file mode 100644
index 000000000..980b08b95
--- /dev/null
+++ b/src/rocksdb/memory/jemalloc_nodump_allocator.cc
@@ -0,0 +1,206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "memory/jemalloc_nodump_allocator.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <string>
+#include <thread>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+// Storage for the static member. It starts out null and is filled in by
+// NewJemallocNodumpAllocator() with the arena's original extent alloc hook,
+// which Alloc() then loads and forwards to.
+std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr};
+
+// Stores a copy of `options`, takes ownership of `arena_hooks`, records
+// `arena_index`, and hands DestroyThreadSpecificCache to the thread-local
+// tcache_ slot as its cleanup function.
+JemallocNodumpAllocator::JemallocNodumpAllocator(
+ JemallocAllocatorOptions& options,
+ std::unique_ptr<extent_hooks_t>&& arena_hooks, unsigned arena_index)
+ : options_(options),
+ arena_hooks_(std::move(arena_hooks)),
+ arena_index_(arena_index),
+ tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {}
+
+// Returns the tcache flag to OR into mallocx()/dallocx() flags for an
+// allocation of `size` bytes: MALLOCX_TCACHE_NONE when tcache is bypassed,
+// otherwise MALLOCX_TCACHE(index) for the calling thread's tcache, which is
+// created on first use.
+int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) {
+  // tcache is always enabled. The one corner case is a very large number of
+  // threads that each access rarely: that may consume a lot of memory (up to
+  // # threads * ~1MB) without much benefit.
+  if (options_.limit_tcache_size) {
+    const bool out_of_range = size <= options_.tcache_size_lower_bound ||
+                              size > options_.tcache_size_upper_bound;
+    if (out_of_range) {
+      return MALLOCX_TCACHE_NONE;
+    }
+  }
+
+  unsigned* slot = reinterpret_cast<unsigned*>(tcache_.Get());
+  if (UNLIKELY(slot == nullptr)) {
+    // No tcache recorded for this thread yet: ask jemalloc to create one.
+    unsigned* created = new unsigned(0);
+    size_t created_size = sizeof(unsigned);
+    const int rc =
+        mallctl("tcache.create", created, &created_size, nullptr, 0);
+    if (rc != 0) {
+      // There is no good way to surface the error; quietly go without tcache.
+      delete created;
+      return MALLOCX_TCACHE_NONE;
+    }
+    tcache_.Reset(static_cast<void*>(created));
+    slot = created;
+  }
+  return MALLOCX_TCACHE(*slot);
+}
+
+// Allocates `size` bytes from this allocator's arena, using the tcache flag
+// chosen by GetThreadSpecificCache(size).
+void* JemallocNodumpAllocator::Allocate(size_t size) {
+  const int tcache_flag = GetThreadSpecificCache(size);
+  const int flags = MALLOCX_ARENA(arena_index_) | tcache_flag;
+  return mallocx(size, flags);
+}
+
+// Releases `p` through dallocx(), using the tcache flag chosen by
+// GetThreadSpecificCache().
+void JemallocNodumpAllocator::Deallocate(void* p) {
+  // The usable size is only looked up when tcache size limits are enabled;
+  // otherwise 0 is passed along.
+  const size_t size = options_.limit_tcache_size ? malloc_usable_size(p) : 0;
+  const int tcache_flag = GetThreadSpecificCache(size);
+  // dallocx() does not need the arena index: jemalloc finds it in its own
+  // metadata.
+  dallocx(p, tcache_flag);
+}
+
+// Extent alloc hook installed on the allocator's arena. Forwards the request
+// to the saved original hook and, when that returns memory, marks the range
+// MADV_DONTDUMP.
+//
+// The parameters are those of jemalloc's extent_alloc_t and are passed
+// through unchanged. Returns whatever the original hook returned. A madvise()
+// failure is reported on stderr and hits assert(false); the extent is still
+// returned to the caller.
+void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
+                                     size_t size, size_t alignment, bool* zero,
+                                     bool* commit, unsigned arena_ind) {
+  extent_alloc_t* original_alloc =
+      original_alloc_.load(std::memory_order_relaxed);
+  assert(original_alloc != nullptr);
+  void* result = original_alloc(extent, new_addr, size, alignment, zero, commit,
+                                arena_ind);
+  if (result != nullptr) {
+    int ret = madvise(result, size, MADV_DONTDUMP);
+    if (ret != 0) {
+      // madvise() only signals failure through its return value (-1); the
+      // actual reason is in errno, so report that instead of `ret`. Capture it
+      // before any other library call can overwrite it.
+      const int err = errno;
+      fprintf(
+          stderr,
+          "JemallocNodumpAllocator failed to set MADV_DONTDUMP, error code: %d",
+          err);
+      assert(false);
+    }
+  }
+  return result;
+}
+
+// Asks jemalloc to destroy arena `arena_index` through the
+// "arena.<index>.destroy" mallctl. Returns OK on success, otherwise
+// Status::Incomplete carrying the mallctl return code.
+Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) {
+  assert(arena_index != 0);
+  const std::string destroy_key =
+      "arena." + ToString(arena_index) + ".destroy";
+  const int rc = mallctl(destroy_key.c_str(), nullptr, 0, nullptr, 0);
+  if (rc == 0) {
+    return Status::OK();
+  }
+  return Status::Incomplete("Failed to destroy jemalloc arena, error code: " +
+                            ToString(rc));
+}
+
+// Destroys the jemalloc tcache whose index is stored at `ptr`, then frees the
+// heap-allocated index itself. `ptr` must be non-null.
+void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) {
+  assert(ptr != nullptr);
+  unsigned* const index = static_cast<unsigned*>(ptr);
+  const size_t index_size = sizeof(unsigned);
+  // A failure is not reported to the caller; it is only checked by the assert.
+  int rc __attribute__((__unused__)) =
+      mallctl("tcache.destroy", nullptr, 0, index, index_size);
+  assert(rc == 0);
+  delete index;
+}
+
+// Tears down the tcaches recorded in tcache_ first, then the arena itself.
+JemallocNodumpAllocator::~JemallocNodumpAllocator() {
+  // tcaches must be destroyed before the arena is.
+  autovector<void*> scraped_indices;
+  tcache_.Scrape(&scraped_indices, nullptr);
+  for (void* index_ptr : scraped_indices) {
+    DestroyThreadSpecificCache(index_ptr);
+  }
+  // An arena destruction failure is not reported; it is only checked by the
+  // assert.
+  Status destroy_status __attribute__((__unused__)) =
+      DestroyArena(arena_index_);
+  assert(destroy_status.ok());
+}
+
+// Reports malloc_usable_size() for `p`; the requested allocation size is not
+// consulted.
+size_t JemallocNodumpAllocator::UsableSize(void* p,
+                                           size_t /*allocation_size*/) const {
+  // `p` is already a void*, so it is passed straight through.
+  return malloc_usable_size(p);
+}
+#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+// Creates a JemallocNodumpAllocator backed by a freshly created jemalloc arena
+// whose extent alloc hook is replaced by JemallocNodumpAllocator::Alloc.
+//
+// options: tcache settings; when limit_tcache_size is set,
+//   tcache_size_lower_bound must be smaller than tcache_size_upper_bound.
+// memory_allocator: output; must be non-null. It is reset to nullptr up front
+//   and only set to the new allocator on success.
+//
+// Returns OK on success; InvalidArgument for a null output pointer or invalid
+// bounds; NotSupported when the allocator is compiled out or HasJemalloc() is
+// false; Incomplete when a jemalloc call fails or the arena's original alloc
+// hook differs from the one already recorded.
+Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator) {
+  // Validate the output pointer before it is dereferenced below.
+  if (memory_allocator == nullptr) {
+    return Status::InvalidArgument("memory_allocator must be non-null.");
+  }
+  *memory_allocator = nullptr;
+  Status unsupported = Status::NotSupported(
+      "JemallocNodumpAllocator only available with jemalloc version >= 5 "
+      "and MADV_DONTDUMP is available.");
+#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  (void)options;
+  return unsupported;
+#else
+  if (!HasJemalloc()) {
+    return unsupported;
+  }
+  if (options.limit_tcache_size &&
+      options.tcache_size_lower_bound >= options.tcache_size_upper_bound) {
+    return Status::InvalidArgument(
+        "tcache_size_lower_bound larger or equal to tcache_size_upper_bound.");
+  }
+
+  // Create arena.
+  unsigned arena_index = 0;
+  size_t arena_index_size = sizeof(arena_index);
+  int ret =
+      mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to create jemalloc arena, error code: " +
+                              ToString(ret));
+  }
+  assert(arena_index != 0);
+
+  // Read existing hooks. From here on, every failure path destroys the arena
+  // that was just created.
+  std::string key = "arena." + ToString(arena_index) + ".extent_hooks";
+  extent_hooks_t* hooks;
+  size_t hooks_size = sizeof(hooks);
+  ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Failed to read existing hooks, error code: " +
+                              ToString(ret));
+  }
+
+  // Store existing alloc. The first caller records it; a later caller is
+  // accepted only if its arena reports the same original alloc hook.
+  extent_alloc_t* original_alloc = hooks->alloc;
+  extent_alloc_t* expected = nullptr;
+  bool success =
+      JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+          expected, original_alloc);
+  if (!success && original_alloc != expected) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Original alloc conflict.");
+  }
+
+  // Set the custom hook: a copy of the existing hooks with only `alloc`
+  // replaced.
+  std::unique_ptr<extent_hooks_t> new_hooks(new extent_hooks_t(*hooks));
+  new_hooks->alloc = &JemallocNodumpAllocator::Alloc;
+  extent_hooks_t* hooks_ptr = new_hooks.get();
+  ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Failed to set custom hook, error code: " +
+                              ToString(ret));
+  }
+
+  // Create cache allocator. It takes ownership of the hooks object.
+  memory_allocator->reset(
+      new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index));
+  return Status::OK();
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/jemalloc_nodump_allocator.h b/src/rocksdb/memory/jemalloc_nodump_allocator.h
new file mode 100644
index 000000000..7bc255508
--- /dev/null
+++ b/src/rocksdb/memory/jemalloc_nodump_allocator.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <vector>
+
+#include "port/jemalloc_helper.h"
+#include "port/port.h"
+#include "rocksdb/memory_allocator.h"
+#include "util/thread_local.h"
+
+#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
+
+#include <sys/mman.h>
+
+#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)
+#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+namespace ROCKSDB_NAMESPACE {
+
+// MemoryAllocator implementation that serves allocations from a dedicated
+// jemalloc arena. The arena's extent alloc hook is replaced by Alloc(), which
+// applies MADV_DONTDUMP to memory obtained from the original hook.
+// NewJemallocNodumpAllocator() creates the arena, installs the hook and
+// constructs the instance.
+class JemallocNodumpAllocator : public MemoryAllocator {
+ public:
+ // Keeps a copy of `options`, takes ownership of `arena_hooks`, and records
+ // the index of the arena to allocate from.
+ JemallocNodumpAllocator(JemallocAllocatorOptions& options,
+ std::unique_ptr<extent_hooks_t>&& arena_hooks,
+ unsigned arena_index);
+ // Destroys the tcaches recorded in tcache_, then the arena.
+ ~JemallocNodumpAllocator();
+
+ const char* Name() const override { return "JemallocNodumpAllocator"; }
+ // Allocates `size` bytes from the arena via mallocx().
+ void* Allocate(size_t size) override;
+ // Releases `p` via dallocx().
+ void Deallocate(void* p) override;
+ // Returns malloc_usable_size(p); `allocation_size` is not used.
+ size_t UsableSize(void* p, size_t allocation_size) const override;
+
+ private:
+ // The factory function reads original_alloc_ and calls DestroyArena() and
+ // Alloc, all of which are private.
+ friend Status NewJemallocNodumpAllocator(
+ JemallocAllocatorOptions& options,
+ std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+ // Custom alloc hook to replace jemalloc default alloc.
+ static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
+ size_t alignment, bool* zero, bool* commit,
+ unsigned arena_ind);
+
+ // Destroy arena on destruction of the allocator, or on failure.
+ static Status DestroyArena(unsigned arena_index);
+
+ // Destroy tcache on destruction of the allocator, or thread exit.
+ static void DestroyThreadSpecificCache(void* ptr);
+
+ // Get or create tcache. Return flag suitable to use with `mallocx`:
+ // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc).
+ int GetThreadSpecificCache(size_t size);
+
+ // A function pointer to jemalloc default alloc. Use atomic to make sure
+ // NewJemallocNodumpAllocator is thread-safe.
+ //
+ // Hack: original_alloc_ needs to be static for Alloc() to access it.
+ // alloc needs to be static to pass to jemalloc as function pointer.
+ static std::atomic<extent_alloc_t*> original_alloc_;
+
+ // Copy of the options supplied at construction.
+ const JemallocAllocatorOptions options_;
+
+ // Custom hooks have to outlive the corresponding arena.
+ const std::unique_ptr<extent_hooks_t> arena_hooks_;
+
+ // Arena index.
+ const unsigned arena_index_;
+
+ // Hold thread-local tcache index.
+ ThreadLocalPtr tcache_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP
+#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
diff --git a/src/rocksdb/memory/memory_allocator.h b/src/rocksdb/memory/memory_allocator.h
new file mode 100644
index 000000000..f1a548659
--- /dev/null
+++ b/src/rocksdb/memory/memory_allocator.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include "rocksdb/memory_allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Deleter used by CacheAllocationPtr. The memory is handed back to
+// `allocator` when one was supplied; otherwise it is released with delete[].
+struct CustomDeleter {
+  CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
+
+  void operator()(char* ptr) const {
+    if (allocator == nullptr) {
+      delete[] ptr;
+      return;
+    }
+    allocator->Deallocate(static_cast<void*>(ptr));
+  }
+
+  // Allocator to return the memory to, or nullptr to use delete[].
+  MemoryAllocator* allocator;
+};
+
+using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
+
+// Allocates `size` bytes and wraps them in a CacheAllocationPtr. With a
+// non-null `allocator` the memory comes from allocator->Allocate() and the
+// returned pointer's deleter carries that allocator; otherwise the memory
+// comes from new char[] and the deleter is default-constructed.
+inline CacheAllocationPtr AllocateBlock(size_t size,
+                                        MemoryAllocator* allocator) {
+  if (allocator == nullptr) {
+    return CacheAllocationPtr(new char[size]);
+  }
+  char* const block = reinterpret_cast<char*>(allocator->Allocate(size));
+  return CacheAllocationPtr(block, allocator);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/memory_usage.h b/src/rocksdb/memory/memory_usage.h
new file mode 100644
index 000000000..15e8b87cd
--- /dev/null
+++ b/src/rocksdb/memory/memory_usage.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <unordered_map>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper methods to estimate memory usage by std containers.
+
+// Estimates the memory footprint of `umap` in bytes as the sum of the
+// container object itself, one value_type plus one pointer per element, and
+// one pointer per hash bucket.
+template <class Key, class Value, class Hash>
+size_t ApproximateMemoryUsage(
+    const std::unordered_map<Key, Value, Hash>& umap) {
+  using Map = std::unordered_map<Key, Value, Hash>;
+  // Each item is counted as its value_type plus a next pointer.
+  const size_t per_item_bytes =
+      sizeof(typename Map::value_type) + sizeof(void*);
+  const size_t items_bytes = per_item_bytes * umap.size();
+  // Each hash bucket is counted as one pointer.
+  const size_t buckets_bytes = umap.bucket_count() * sizeof(void*);
+  return sizeof(umap) + items_bytes + buckets_bytes;
+}
+
+} // namespace ROCKSDB_NAMESPACE