author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-27 18:24:20 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-27 18:24:20 +0000
commit     483eb2f56657e8e7f419ab1a4fab8dce9ade8609
tree       e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/rocksdb/util
parent     Initial commit.
Adding upstream version 14.2.21.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/util')
121 files changed, 25997 insertions, 0 deletions
diff --git a/src/rocksdb/util/aligned_buffer.h b/src/rocksdb/util/aligned_buffer.h new file mode 100644 index 00000000..2201b487 --- /dev/null +++ b/src/rocksdb/util/aligned_buffer.h @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <algorithm> +#include "port/port.h" + +namespace rocksdb { + +inline size_t TruncateToPageBoundary(size_t page_size, size_t s) { + s -= (s & (page_size - 1)); + assert((s % page_size) == 0); + return s; +} + +inline size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; +} + +inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; } + +// This class is to manage an aligned user +// allocated buffer for direct I/O purposes +// though can be used for any purpose. +class AlignedBuffer { + size_t alignment_; + std::unique_ptr<char[]> buf_; + size_t capacity_; + size_t cursize_; + char* bufstart_; + +public: + AlignedBuffer() + : alignment_(), + capacity_(0), + cursize_(0), + bufstart_(nullptr) { + } + + AlignedBuffer(AlignedBuffer&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + AlignedBuffer& operator=(AlignedBuffer&& o) ROCKSDB_NOEXCEPT { + alignment_ = std::move(o.alignment_); + buf_ = std::move(o.buf_); + capacity_ = std::move(o.capacity_); + cursize_ = std::move(o.cursize_); + bufstart_ = std::move(o.bufstart_); + return *this; + } + + AlignedBuffer(const AlignedBuffer&) = delete; + + AlignedBuffer& operator=(const AlignedBuffer&) = delete; + + static bool isAligned(const void* ptr, size_t alignment) { + return reinterpret_cast<uintptr_t>(ptr) % alignment == 0; + } + + static bool isAligned(size_t n, size_t alignment) { + return n % alignment == 0; + } + + size_t Alignment() const { + return alignment_; + } + + size_t Capacity() const { + return capacity_; + } + + size_t CurrentSize() const { + return cursize_; + } + + const char* BufferStart() const { + return bufstart_; + } + + char* BufferStart() { return bufstart_; } + + void Clear() { + cursize_ = 0; + } + + void Alignment(size_t alignment) { + assert(alignment > 0); + assert((alignment & (alignment - 1)) == 0); + alignment_ = alignment; + } + + // Allocates a new buffer and sets bufstart_ to the aligned first byte. + // requested_capacity: requested new buffer capacity. This capacity will be + // rounded up based on alignment. + // copy_data: Copy data from old buffer to new buffer. + // copy_offset: Copy data from this offset in old buffer. + // copy_len: Number of bytes to copy. + void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false, + uint64_t copy_offset = 0, size_t copy_len = 0) { + assert(alignment_ > 0); + assert((alignment_ & (alignment_ - 1)) == 0); + + copy_len = copy_len > 0 ? copy_len : cursize_; + if (copy_data && requested_capacity < copy_len) { + // If we are downsizing to a capacity that is smaller than the current + // data in the buffer. Ignore the request. 
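+ // (At this point copy_len holds cursize_ whenever the caller passed 0,
+ // so this guard also rejects any resize that would drop buffered data.)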
+ return; + } + + size_t new_capacity = Roundup(requested_capacity, alignment_); + char* new_buf = new char[new_capacity + alignment_]; + char* new_bufstart = reinterpret_cast<char*>( + (reinterpret_cast<uintptr_t>(new_buf) + (alignment_ - 1)) & + ~static_cast<uintptr_t>(alignment_ - 1)); + + if (copy_data) { + assert(bufstart_ + copy_offset + copy_len <= bufstart_ + cursize_); + memcpy(new_bufstart, bufstart_ + copy_offset, copy_len); + cursize_ = copy_len; + } else { + cursize_ = 0; + } + + bufstart_ = new_bufstart; + capacity_ = new_capacity; + buf_.reset(new_buf); + } + // Used for write + // Returns the number of bytes appended + size_t Append(const char* src, size_t append_size) { + size_t buffer_remaining = capacity_ - cursize_; + size_t to_copy = std::min(append_size, buffer_remaining); + + if (to_copy > 0) { + memcpy(bufstart_ + cursize_, src, to_copy); + cursize_ += to_copy; + } + return to_copy; + } + + size_t Read(char* dest, size_t offset, size_t read_size) const { + assert(offset < cursize_); + + size_t to_read = 0; + if(offset < cursize_) { + to_read = std::min(cursize_ - offset, read_size); + } + if (to_read > 0) { + memcpy(dest, bufstart_ + offset, to_read); + } + return to_read; + } + + /// Pad to alignment + void PadToAlignmentWith(int padding) { + size_t total_size = Roundup(cursize_, alignment_); + size_t pad_size = total_size - cursize_; + + if (pad_size > 0) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + } + + void PadWith(size_t pad_size, int padding) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + + // After a partial flush move the tail to the beginning of the buffer + void RefitTail(size_t tail_offset, size_t tail_size) { + if (tail_size > 0) { + memmove(bufstart_, bufstart_ + tail_offset, tail_size); + } + cursize_ = tail_size; + } + + // Returns place to start writing + char* Destination() { + return bufstart_ + cursize_; + } + + void Size(size_t cursize) { + cursize_ = cursize; + } +}; +} diff --git a/src/rocksdb/util/allocator.h b/src/rocksdb/util/allocator.h new file mode 100644 index 00000000..505d6ba2 --- /dev/null +++ b/src/rocksdb/util/allocator.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Abstract interface for allocating memory in blocks. This memory is freed +// when the allocator object is destroyed. See the Arena class for more info. 
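AlignedBuffer::AllocateNewBuffer above produces an aligned region by over-allocating one extra alignment unit and masking the start pointer up to the next boundary, and the Allocator interface declared just below abstracts block allocation. As a hedged, standalone sketch (hypothetical names, not part of this diff), here is a minimal bump allocator combining both ideas with the same over-allocate-and-mask technique:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical bump allocator; hands out raw bytes from a fixed backing
// buffer, with aligned requests served via pointer masking.
class BumpAllocator {
 public:
  BumpAllocator(char* buf, size_t len) : cur_(buf), end_(buf + len) {}

  char* Allocate(size_t bytes) {
    if (static_cast<size_t>(end_ - cur_) < bytes) return nullptr;
    char* r = cur_;
    cur_ += bytes;
    return r;
  }

  char* AllocateAligned(size_t bytes, size_t alignment) {
    assert((alignment & (alignment - 1)) == 0);  // power of two required
    uintptr_t p = reinterpret_cast<uintptr_t>(cur_);
    uintptr_t aligned =
        (p + alignment - 1) & ~static_cast<uintptr_t>(alignment - 1);
    size_t slop = static_cast<size_t>(aligned - p);  // bytes skipped
    char* r = Allocate(bytes + slop);
    return r ? r + slop : nullptr;
  }

 private:
  char* cur_;
  char* end_;
};

int main() {
  alignas(64) char backing[1024];
  BumpAllocator a(backing, sizeof(backing));
  char* p = a.Allocate(3);               // deliberately unaligned request
  char* q = a.AllocateAligned(100, 64);  // 64-byte-aligned request
  assert(p != nullptr && q != nullptr);
  assert(reinterpret_cast<uintptr_t>(q) % 64 == 0);
  return 0;
}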
+ +#pragma once +#include <cerrno> +#include <cstddef> +#include "rocksdb/write_buffer_manager.h" + +namespace rocksdb { + +class Logger; + +class Allocator { + public: + virtual ~Allocator() {} + + virtual char* Allocate(size_t bytes) = 0; + virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) = 0; + + virtual size_t BlockSize() const = 0; +}; + +class AllocTracker { + public: + explicit AllocTracker(WriteBufferManager* write_buffer_manager); + ~AllocTracker(); + void Allocate(size_t bytes); + // Call when we're finished allocating memory so we can free it from + // the write buffer's limit. + void DoneAllocating(); + + void FreeMem(); + + bool is_freed() const { return write_buffer_manager_ == nullptr || freed_; } + + private: + WriteBufferManager* write_buffer_manager_; + std::atomic<size_t> bytes_allocated_; + bool done_allocating_; + bool freed_; + + // No copying allowed + AllocTracker(const AllocTracker&); + void operator=(const AllocTracker&); +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/arena.cc b/src/rocksdb/util/arena.cc new file mode 100644 index 00000000..d7799eb2 --- /dev/null +++ b/src/rocksdb/util/arena.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else +#include <malloc.h> +#endif +#endif +#ifndef OS_WIN +#include <sys/mman.h> +#endif +#include <algorithm> +#include "port/port.h" +#include "rocksdb/env.h" +#include "util/logging.h" +#include "util/sync_point.h" + +namespace rocksdb { + +// MSVC complains that it is already defined since it is static in the header. 
+#ifndef _MSC_VER +const size_t Arena::kInlineSize; +#endif + +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2u << 30; +static const int kAlignUnit = alignof(max_align_t); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; + } + + return block_size; +} + +Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size) + : kBlockSize(OptimizeBlockSize(block_size)), tracker_(tracker) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); + TEST_SYNC_POINT_CALLBACK("Arena::Arena:0", const_cast<size_t*>(&kBlockSize)); + alloc_bytes_remaining_ = sizeof(inline_block_); + blocks_memory_ += alloc_bytes_remaining_; + aligned_alloc_ptr_ = inline_block_; + unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; +#ifdef MAP_HUGETLB + hugetlb_size_ = huge_page_size; + if (hugetlb_size_ && kBlockSize > hugetlb_size_) { + hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; + } +#else + (void)huge_page_size; +#endif + if (tracker_ != nullptr) { + tracker_->Allocate(kInlineSize); + } +} + +Arena::~Arena() { + if (tracker_ != nullptr) { + assert(tracker_->is_freed()); + tracker_->FreeMem(); + } + for (const auto& block : blocks_) { + delete[] block; + } + +#ifdef MAP_HUGETLB + for (const auto& mmap_info : huge_blocks_) { + if (mmap_info.addr_ == nullptr) { + continue; + } + auto ret = munmap(mmap_info.addr_, mmap_info.length_); + if (ret != 0) { + // TODO(sdong): Better handling + } + } +#endif +} + +char* Arena::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { + ++irregular_block_num; + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + return AllocateNewBlock(bytes); + } + + // We waste the remaining space in the current block. + size_t size = 0; + char* block_head = nullptr; +#ifdef MAP_HUGETLB + if (hugetlb_size_) { + size = hugetlb_size_; + block_head = AllocateFromHugePage(size); + } +#endif + if (!block_head) { + size = kBlockSize; + block_head = AllocateNewBlock(size); + } + alloc_bytes_remaining_ = size - bytes; + + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + size; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + size - bytes; + return unaligned_alloc_ptr_; + } +} + +char* Arena::AllocateFromHugePage(size_t bytes) { +#ifdef MAP_HUGETLB + if (hugetlb_size_ == 0) { + return nullptr; + } + // Reserve space in `huge_blocks_` before calling `mmap`. + // Use `emplace_back()` instead of `reserve()` to let std::vector manage its + // own memory and do fewer reallocations. + // + // - If `emplace_back` throws, no memory leaks because we haven't called + // `mmap` yet. + // - If `mmap` throws, no memory leaks because the vector will be cleaned up + // via RAII. 
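This slot-first discipline (publish an empty entry, then perform the fallible acquisition, then fill the slot) reappears in Arena::AllocateNewBlock below. A hedged standalone sketch, with a hypothetical TrackedNew and plain new in place of mmap:

#include <cstddef>
#include <vector>

// Grow the container before the fallible acquisition so no path leaks.
char* TrackedNew(std::vector<char*>& blocks, size_t bytes) {
  blocks.push_back(nullptr);      // may throw, but nothing to leak yet
  char* block = new char[bytes];  // may throw; the null slot is harmless
  blocks.back() = block;          // cannot throw: the slot already exists
  return block;
}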
+  huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */);
+
+  void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
+                    (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
+
+  if (addr == MAP_FAILED) {
+    return nullptr;
+  }
+  huge_blocks_.back() = MmapInfo(addr, bytes);
+  blocks_memory_ += bytes;
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(bytes);
+  }
+  return reinterpret_cast<char*>(addr);
+#else
+  (void)bytes;
+  return nullptr;
+#endif
+}
+
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
+                             Logger* logger) {
+  assert((kAlignUnit & (kAlignUnit - 1)) ==
+         0);  // kAlignUnit must be a power of 2
+
+#ifdef MAP_HUGETLB
+  if (huge_page_size > 0 && bytes > 0) {
+    // Allocate from a huge page TLB.
+    assert(logger != nullptr);  // a logger needs to be passed in.
+    size_t reserved_size =
+        ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
+    assert(reserved_size >= bytes);
+
+    char* addr = AllocateFromHugePage(reserved_size);
+    if (addr == nullptr) {
+      ROCKS_LOG_WARN(logger,
+                     "AllocateAligned fail to allocate huge TLB pages: %s",
+                     strerror(errno));
+      // fall back to malloc
+    } else {
+      return addr;
+    }
+  }
+#else
+  (void)huge_page_size;
+  (void)logger;
+#endif
+
+  size_t current_mod =
+      reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
+  size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
+  size_t needed = bytes + slop;
+  char* result;
+  if (needed <= alloc_bytes_remaining_) {
+    result = aligned_alloc_ptr_ + slop;
+    aligned_alloc_ptr_ += needed;
+    alloc_bytes_remaining_ -= needed;
+  } else {
+    // AllocateFallback always returns aligned memory
+    result = AllocateFallback(bytes, true /* aligned */);
+  }
+  assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
+  return result;
+}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {
+  // Reserve space in `blocks_` before allocating memory via new.
+  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+  // own memory and do fewer reallocations.
+  //
+  // - If `emplace_back` throws, no memory leaks because we haven't called
+  //   `new` yet.
+  // - If `new` throws, no memory leaks because the vector will be cleaned up
+  //   via RAII.
+  blocks_.emplace_back(nullptr);
+
+  char* block = new char[block_bytes];
+  size_t allocated_size;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  allocated_size = malloc_usable_size(block);
+#ifndef NDEBUG
+  // It's hard to predict what malloc_usable_size() returns.
+  // A callback can allow users to change the costed size.
+  std::pair<size_t*, size_t*> pair(&allocated_size, &block_bytes);
+  TEST_SYNC_POINT_CALLBACK("Arena::AllocateNewBlock:0", &pair);
+#endif  // NDEBUG
+#else
+  allocated_size = block_bytes;
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  blocks_memory_ += allocated_size;
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(allocated_size);
+  }
+  blocks_.back() = block;
+  return block;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/arena.h b/src/rocksdb/util/arena.h
new file mode 100644
index 00000000..dc64154c
--- /dev/null
+++ b/src/rocksdb/util/arena.h
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Arena is an implementation of the Allocator class. For a small request, it
+// allocates from a block of pre-defined block size. For a big request, it
+// uses malloc to get the requested size directly.
+
+#pragma once
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
+#include <cstddef>
+#include <cerrno>
+#include <vector>
+#include <assert.h>
+#include <stdint.h>
+#include "util/allocator.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+class Arena : public Allocator {
+ public:
+  // No copying allowed
+  Arena(const Arena&) = delete;
+  void operator=(const Arena&) = delete;
+
+  static const size_t kInlineSize = 2048;
+  static const size_t kMinBlockSize;
+  static const size_t kMaxBlockSize;
+
+  // huge_page_size: if 0, don't use huge page TLB. If > 0 (it should be set
+  // to the supported hugepage size of the system), block allocation will try
+  // huge page TLB first. If allocation fails, it will fall back to the
+  // normal case.
+  explicit Arena(size_t block_size = kMinBlockSize,
+                 AllocTracker* tracker = nullptr, size_t huge_page_size = 0);
+  ~Arena();
+
+  char* Allocate(size_t bytes) override;
+
+  // huge_page_size: if > 0, will try to allocate from huge page TLB.
+  // The argument is the page size to use for huge page TLB. Bytes will be
+  // rounded up to a multiple of the page size to allocate through mmap
+  // with the anonymous option and huge pages on. The extra space allocated
+  // will be wasted. If allocation fails, it will fall back to the normal
+  // case. To enable it, huge pages need to be reserved for allocation,
+  // like:
+  //      sysctl -w vm.nr_hugepages=20
+  // See the Linux doc Documentation/vm/hugetlbpage.txt for details.
+  // Huge page allocation can fail. In that case it will fall back to the
+  // normal case. The error messages will be logged to the logger. So when
+  // calling with huge_page_size > 0, we highly recommend a logger is passed
+  // in. Otherwise, the error message will be printed out to stderr directly.
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr) override;
+
+  // Returns an estimate of the total memory usage of data allocated
+  // by the arena (excluding the space allocated but not yet used for future
+  // allocations).
+  size_t ApproximateMemoryUsage() const {
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+           alloc_bytes_remaining_;
+  }
+
+  size_t MemoryAllocatedBytes() const { return blocks_memory_; }
+
+  size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; }
+
+  // If an allocation is too big, we'll allocate an irregular block with the
+  // same size as that allocation.
+  size_t IrregularBlockNum() const { return irregular_block_num; }
+
+  size_t BlockSize() const override { return kBlockSize; }
+
+  bool IsInInlineBlock() const {
+    return blocks_.empty();
+  }
+
+ private:
+  char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t))));
+  // Number of bytes allocated in one block
+  const size_t kBlockSize;
+  // Array of new[] allocated memory blocks
+  typedef std::vector<char*> Blocks;
+  Blocks blocks_;
+
+  struct MmapInfo {
+    void* addr_;
+    size_t length_;
+
+    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+  };
+  std::vector<MmapInfo> huge_blocks_;
+  size_t irregular_block_num = 0;
+
+  // Stats for the current active block.
+  // For each block, we allocate aligned memory chunks from one end and
+  // allocate unaligned memory chunks from the other end.
Otherwise the + // memory waste for alignment will be higher if we allocate both types of + // memory from one direction. + char* unaligned_alloc_ptr_ = nullptr; + char* aligned_alloc_ptr_ = nullptr; + // How many bytes left in currently active block? + size_t alloc_bytes_remaining_ = 0; + +#ifdef MAP_HUGETLB + size_t hugetlb_size_ = 0; +#endif // MAP_HUGETLB + char* AllocateFromHugePage(size_t bytes); + char* AllocateFallback(size_t bytes, bool aligned); + char* AllocateNewBlock(size_t block_bytes); + + // Bytes of memory in blocks allocated so far + size_t blocks_memory_ = 0; + AllocTracker* tracker_; +}; + +inline char* Arena::Allocate(size_t bytes) { + // The semantics of what to return are a bit messy if we allow + // 0-byte allocations, so we disallow them here (we don't need + // them for our internal use). + assert(bytes > 0); + if (bytes <= alloc_bytes_remaining_) { + unaligned_alloc_ptr_ -= bytes; + alloc_bytes_remaining_ -= bytes; + return unaligned_alloc_ptr_; + } + return AllocateFallback(bytes, false /* unaligned */); +} + +// check and adjust the block_size so that the return value is +// 1. in the range of [kMinBlockSize, kMaxBlockSize]. +// 2. the multiple of align unit. +extern size_t OptimizeBlockSize(size_t block_size); + +} // namespace rocksdb diff --git a/src/rocksdb/util/arena_test.cc b/src/rocksdb/util/arena_test.cc new file mode 100644 index 00000000..9dfc28ab --- /dev/null +++ b/src/rocksdb/util/arena_test.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +namespace { +const size_t kHugePageSize = 2 * 1024 * 1024; +} // namespace +class ArenaTest : public testing::Test {}; + +TEST_F(ArenaTest, Empty) { Arena arena0; } + +namespace { +bool CheckMemoryAllocated(size_t allocated, size_t expected) { + // The value returned by Arena::MemoryAllocatedBytes() may be greater than + // the requested memory. We choose a somewhat arbitrary upper bound of + // max_expected = expected * 1.1 to detect critical overallocation. + size_t max_expected = expected + expected / 10; + return allocated >= expected && allocated <= max_expected; +} + +void MemoryAllocatedBytesTest(size_t huge_page_size) { + const int N = 17; + size_t req_sz; // requested size + size_t bsz = 32 * 1024; // block size + size_t expected_memory_allocated; + + Arena arena(bsz, nullptr, huge_page_size); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 12 * 1024; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated = req_sz * N + Arena::kInlineSize; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); + + arena.Allocate(Arena::kInlineSize - 1); + + // requested size < quarter of a block: + // allocate a block with the default size, then try to use unused part + // of the block. So one new block will be allocated for the first + // Allocate(99) call. All the remaining calls won't lead to new allocation. 
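That expectation follows from Arena::AllocateFallback shown earlier: a request no larger than a quarter of the block size is carved out of a shared block, so only the first small request opens a new block. A hedged standalone model of that bookkeeping (hypothetical, mirroring the constants this test uses):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t bsz = 32 * 1024;  // block size used by the test above
  const size_t req_sz = 99;      // small request, well under bsz / 4
  const int N = 17;
  size_t blocks = 0, remaining = 0;
  for (int i = 0; i < N; ++i) {
    if (req_sz > remaining) {    // only the first call opens a new block
      ++blocks;
      remaining = bsz;
    }
    remaining -= req_sz;         // later calls reuse the block's free space
  }
  std::printf("new blocks: %zu\n", blocks);  // prints 1
  return 0;
}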
+ req_sz = 99; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + if (huge_page_size) { + ASSERT_TRUE( + CheckMemoryAllocated(arena.MemoryAllocatedBytes(), + expected_memory_allocated + bsz) || + CheckMemoryAllocated(arena.MemoryAllocatedBytes(), + expected_memory_allocated + huge_page_size)); + } else { + expected_memory_allocated += bsz; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); + } + + // requested size > size of a block: + // allocate requested size separately + expected_memory_allocated = arena.MemoryAllocatedBytes(); + req_sz = 8 * 1024 * 1024; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated += req_sz * N; + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + expected_memory_allocated); +} + +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +static void ApproximateMemoryUsageTest(size_t huge_page_size) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + Arena arena(kBlockSize, nullptr, huge_page_size); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + // allocate inline bytes + const size_t kAlignUnit = alignof(max_align_t); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(kAlignUnit); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(Arena::kInlineSize / 2 - (2 * kAlignUnit)); + EXPECT_TRUE(arena.IsInInlineBlock()); + arena.AllocateAligned(Arena::kInlineSize / 2); + EXPECT_TRUE(arena.IsInInlineBlock()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - kAlignUnit); + ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), + Arena::kInlineSize); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); + auto mem_usage = arena.MemoryAllocatedBytes(); + if (huge_page_size) { + ASSERT_TRUE( + CheckMemoryAllocated(mem_usage, kBlockSize + Arena::kInlineSize) || + CheckMemoryAllocated(mem_usage, huge_page_size + Arena::kInlineSize)); + } else { + ASSERT_PRED2(CheckMemoryAllocated, mem_usage, + kBlockSize + Arena::kInlineSize); + } + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); + usage = arena.ApproximateMemoryUsage(); + } + if (huge_page_size) { + ASSERT_TRUE(usage > mem_usage || + usage + huge_page_size - kBlockSize == mem_usage); + } else { + ASSERT_GT(usage, mem_usage); + } +} + +static void SimpleTest(size_t huge_page_size) { + std::vector<std::pair<size_t, char*>> allocated; + Arena arena(Arena::kMinBlockSize, nullptr, huge_page_size); + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. 
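+ // (Arena::Allocate asserts that bytes > 0, per the inline definition in
+ // arena.h above, so the test bumps zero-sized requests to one byte.)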
+ s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (unsigned int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); + if (i > N / 10) { + ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); + } + } + for (unsigned int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (unsigned int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256)); + } + } +} +} // namespace + +TEST_F(ArenaTest, MemoryAllocatedBytes) { + MemoryAllocatedBytesTest(0); + MemoryAllocatedBytesTest(kHugePageSize); +} + +TEST_F(ArenaTest, ApproximateMemoryUsage) { + ApproximateMemoryUsageTest(0); + ApproximateMemoryUsageTest(kHugePageSize); +} + +TEST_F(ArenaTest, Simple) { + SimpleTest(0); + SimpleTest(kHugePageSize); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/auto_roll_logger.cc b/src/rocksdb/util/auto_roll_logger.cc new file mode 100644 index 00000000..ae6061ae --- /dev/null +++ b/src/rocksdb/util/auto_roll_logger.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "util/auto_roll_logger.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +// -- AutoRollLogger +Status AutoRollLogger::ResetLogger() { + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); + status_ = env_->NewLogger(log_fname_, &logger_); + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); + + if (!status_.ok()) { + return status_; + } + + if (logger_->GetLogFileSize() == Logger::kDoNotSupportGetLogFileSize) { + status_ = Status::NotSupported( + "The underlying logger doesn't support GetLogFileSize()"); + } + if (status_.ok()) { + cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6); + ctime_ = cached_now; + cached_now_access_count = 0; + } + + return status_; +} + +void AutoRollLogger::RollLogFile() { + // This function is called when log is rotating. Two rotations + // can happen quickly (NowMicro returns same value). To not overwrite + // previous log file we increment by one micro second and try again. + uint64_t now = env_->NowMicros(); + std::string old_fname; + do { + old_fname = OldInfoLogFileName( + dbname_, now, db_absolute_path_, db_log_dir_); + now++; + } while (env_->FileExists(old_fname).ok()); + env_->RenameFile(log_fname_, old_fname); +} + +std::string AutoRollLogger::ValistToString(const char* format, + va_list args) const { + // Any log messages longer than 1024 will get truncated. + // The user is responsible for chopping longer messages into multi line log + static const int MAXBUFFERSIZE = 1024; + char buffer[MAXBUFFERSIZE]; + + int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args); + (void) count; + assert(count >= 0); + + return buffer; +} + +void AutoRollLogger::LogInternal(const char* format, ...) 
{ + mutex_.AssertHeld(); + va_list args; + va_start(args, format); + logger_->Logv(format, args); + va_end(args); +} + +void AutoRollLogger::Logv(const char* format, va_list ap) { + assert(GetStatus().ok()); + + std::shared_ptr<Logger> logger; + { + MutexLock l(&mutex_); + if ((kLogFileTimeToRoll > 0 && LogExpired()) || + (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { + RollLogFile(); + Status s = ResetLogger(); + if (!s.ok()) { + // can't really log the error if creating a new LOG file failed + return; + } + + WriteHeaderInfo(); + } + + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + + // Another thread could have put a new Logger instance into logger_ by now. + // However, since logger is still hanging on to the previous instance + // (reference count is not zero), we don't have to worry about it being + // deleted while we are accessing it. + // Note that logv itself is not mutex protected to allow maximum concurrency, + // as thread safety should have been handled by the underlying logger. + logger->Logv(format, ap); +} + +void AutoRollLogger::WriteHeaderInfo() { + mutex_.AssertHeld(); + for (auto& header : headers_) { + LogInternal("%s", header.c_str()); + } +} + +void AutoRollLogger::LogHeader(const char* format, va_list args) { + // header message are to be retained in memory. Since we cannot make any + // assumptions about the data contained in va_list, we will retain them as + // strings + va_list tmp; + va_copy(tmp, args); + std::string data = ValistToString(format, tmp); + va_end(tmp); + + MutexLock l(&mutex_); + headers_.push_back(data); + + // Log the original message to the current log + logger_->Logv(format, args); +} + +bool AutoRollLogger::LogExpired() { + if (cached_now_access_count >= call_NowMicros_every_N_records_) { + cached_now = static_cast<uint64_t>(env_->NowMicros() * 1e-6); + cached_now_access_count = 0; + } + + ++cached_now_access_count; + return cached_now >= ctime_ + kLogFileTimeToRoll; +} +#endif // !ROCKSDB_LITE + +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr<Logger>* logger) { + if (options.info_log) { + *logger = options.info_log; + return Status::OK(); + } + + Env* env = options.env; + std::string db_absolute_path; + env->GetAbsolutePath(dbname, &db_absolute_path); + std::string fname = + InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); + + env->CreateDirIfMissing(dbname); // In case it does not exist + // Currently we only support roll by time-to-roll and log size +#ifndef ROCKSDB_LITE + if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env, dbname, options.db_log_dir, options.max_log_file_size, + options.log_file_time_to_roll, options.info_log_level); + Status s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + logger->reset(result); + } + return s; + } +#endif // !ROCKSDB_LITE + // Open a log file in the same directory as the db + env->RenameFile(fname, + OldInfoLogFileName(dbname, env->NowMicros(), db_absolute_path, + options.db_log_dir)); + auto s = env->NewLogger(fname, logger); + if (logger->get() != nullptr) { + (*logger)->SetInfoLogLevel(options.info_log_level); + } + return s; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/auto_roll_logger.h b/src/rocksdb/util/auto_roll_logger.h new file mode 100644 index 00000000..64fce4d6 --- /dev/null +++ b/src/rocksdb/util/auto_roll_logger.h @@ -0,0 +1,145 @@ 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#pragma once +#include <list> +#include <string> + +#include "port/port.h" +#include "port/util_logger.h" +#include "util/filename.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +// Rolls the log file by size and/or time +class AutoRollLogger : public Logger { + public: + AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, size_t log_max_size, + size_t log_file_time_to_roll, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + env->GetAbsolutePath(dbname, &db_absolute_path_); + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + RollLogFile(); + ResetLogger(); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override; + + // Write a header entry to the log. All header information will be written + // again every time the log rolls over. + virtual void LogHeader(const char* format, va_list ap) override; + + // check if the logger has encountered any problem. + Status GetStatus() { + return status_; + } + + size_t GetLogFileSize() const override { + std::shared_ptr<Logger> logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + return logger->GetLogFileSize(); + } + + void Flush() override { + std::shared_ptr<Logger> logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + TEST_SYNC_POINT("AutoRollLogger::Flush:PinnedLogger"); + if (logger) { + logger->Flush(); + } + } + + virtual ~AutoRollLogger() { + if (logger_ && !closed_) { + logger_->Close(); + } + } + + void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) { + call_NowMicros_every_N_records_ = call_NowMicros_every_N_records; + } + + // Expose the log file path for testing purpose + std::string TEST_log_fname() const { + return log_fname_; + } + + uint64_t TEST_ctime() const { return ctime_; } + + protected: + // Implementation of Close() + virtual Status CloseImpl() override { + if (logger_) { + return logger_->Close(); + } else { + return Status::OK(); + } + } + + private: + bool LogExpired(); + Status ResetLogger(); + void RollLogFile(); + // Log message to logger without rolling + void LogInternal(const char* format, ...); + // Serialize the va_list to a string + std::string ValistToString(const char* format, va_list args) const; + // Write the logs marked as headers to the new log file + void WriteHeaderInfo(); + std::string log_fname_; // Current active info log's file name. 
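+ // Path components used to derive log_fname_ and the rolled-over
+ // (OldInfoLogFileName) file names: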
+ std::string dbname_; + std::string db_log_dir_; + std::string db_absolute_path_; + Env* env_; + std::shared_ptr<Logger> logger_; + // current status of the logger + Status status_; + const size_t kMaxLogFileSize; + const size_t kLogFileTimeToRoll; + // header information + std::list<std::string> headers_; + // to avoid frequent env->NowMicros() calls, we cached the current time + uint64_t cached_now; + uint64_t ctime_; + uint64_t cached_now_access_count; + uint64_t call_NowMicros_every_N_records_; + mutable port::Mutex mutex_; +}; +#endif // !ROCKSDB_LITE + +// Facade to craete logger automatically +Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr<Logger>* logger); + +} // namespace rocksdb diff --git a/src/rocksdb/util/auto_roll_logger_test.cc b/src/rocksdb/util/auto_roll_logger_test.cc new file mode 100644 index 00000000..ab9e0595 --- /dev/null +++ b/src/rocksdb/util/auto_roll_logger_test.cc @@ -0,0 +1,530 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#ifndef ROCKSDB_LITE + +#include "util/auto_roll_logger.h" +#include <errno.h> +#include <sys/stat.h> +#include <algorithm> +#include <cmath> +#include <fstream> +#include <iostream> +#include <iterator> +#include <string> +#include <thread> +#include <vector> +#include "port/port.h" +#include "rocksdb/db.h" +#include "util/logging.h" +#include "util/sync_point.h" +#include "util/testharness.h" + +namespace rocksdb { +namespace { +class NoSleepEnv : public EnvWrapper { + public: + NoSleepEnv(Env* base) : EnvWrapper(base) {} + void SleepForMicroseconds(int micros) override { + fake_time_ += static_cast<uint64_t>(micros); + } + + uint64_t NowNanos() override { return fake_time_ * 1000; } + + uint64_t NowMicros() override { return fake_time_; } + + private: + uint64_t fake_time_ = 6666666666; +}; +} // namespace + +class AutoRollLoggerTest : public testing::Test { + public: + static void InitTestDb() { +#ifdef OS_WIN + // Replace all slashes in the path so windows CompSpec does not + // become confused + std::string testDir(kTestDir); + std::replace_if(testDir.begin(), testDir.end(), + [](char ch) { return ch == '/'; }, '\\'); + std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir; +#else + std::string deleteCmd = "rm -rf " + kTestDir; +#endif + ASSERT_TRUE(system(deleteCmd.c_str()) == 0); + Env::Default()->CreateDir(kTestDir); + } + + void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, + const std::string& log_message); + void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, + const std::string& log_message); + + static const std::string kSampleMessage; + static const std::string kTestDir; + static const std::string kLogFile; + static Env* default_env; +}; + +const std::string AutoRollLoggerTest::kSampleMessage( + "this is the message to be written to the log file!!"); +const std::string AutoRollLoggerTest::kTestDir( + test::PerThreadDBPath("db_log_test")); +const std::string AutoRollLoggerTest::kLogFile( + test::PerThreadDBPath("db_log_test") + "/LOG"); +Env* AutoRollLoggerTest::default_env = Env::Default(); + +// In this test we only want to Log some simple log message with +// no format. 
LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call ROCKS_LOG_INFO(logger, log_message) directly. +namespace { +void LogMessage(Logger* logger, const char* message) { + ROCKS_LOG_INFO(logger, "%s", message); +} + +void LogMessage(const InfoLogLevel log_level, Logger* logger, + const char* message) { + Log(log_level, logger, "%s", message); +} +} // namespace + +void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, + size_t log_max_size, + const std::string& log_message) { + logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + // measure the size of each message, which is supposed + // to be equal or greater than log_message.size() + LogMessage(logger, log_message.c_str()); + size_t message_size = logger->GetLogFileSize(); + size_t current_log_size = message_size; + + // Test the cases when the log file will not be rolled. + while (current_log_size + message_size < log_max_size) { + LogMessage(logger, log_message.c_str()); + current_log_size += message_size; + ASSERT_EQ(current_log_size, logger->GetLogFileSize()); + } + + // Now the log file will be rolled + LogMessage(logger, log_message.c_str()); + // Since rotation is checked before actual logging, we need to + // trigger the rotation by logging another message. + LogMessage(logger, log_message.c_str()); + + ASSERT_TRUE(message_size == logger->GetLogFileSize()); +} + +void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, + size_t time, + const std::string& log_message) { + uint64_t expected_ctime; + uint64_t actual_ctime; + + uint64_t total_log_size; + EXPECT_OK(env->GetFileSize(kLogFile, &total_log_size)); + expected_ctime = logger->TEST_ctime(); + logger->SetCallNowMicrosEveryNRecords(0); + + // -- Write to the log for several times, which is supposed + // to be finished before time. + for (int i = 0; i < 10; ++i) { + env->SleepForMicroseconds(50000); + LogMessage(logger, log_message.c_str()); + EXPECT_OK(logger->GetStatus()); + // Make sure we always write to the same log file (by + // checking the create time); + + actual_ctime = logger->TEST_ctime(); + + // Also make sure the log size is increasing. + EXPECT_EQ(expected_ctime, actual_ctime); + EXPECT_GT(logger->GetLogFileSize(), total_log_size); + total_log_size = logger->GetLogFileSize(); + } + + // -- Make the log file expire + env->SleepForMicroseconds(static_cast<int>(time * 1000000)); + LogMessage(logger, log_message.c_str()); + + // At this time, the new log file should be created. + actual_ctime = logger->TEST_ctime(); + EXPECT_LT(expected_ctime, actual_ctime); + EXPECT_LT(logger->GetLogFileSize(), total_log_size); +} + +TEST_F(AutoRollLoggerTest, RollLogFileBySize) { + InitTestDb(); + size_t log_max_size = 1024 * 5; + + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + + RollLogFileBySizeTest(&logger, log_max_size, + kSampleMessage + ":RollLogFileBySize"); +} + +TEST_F(AutoRollLoggerTest, RollLogFileByTime) { + NoSleepEnv nse(Env::Default()); + + size_t time = 2; + size_t log_size = 1024 * 5; + + InitTestDb(); + // -- Test the existence of file during the server restart. 
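These tests drive the logger through AutoRollLogger::Logv, whose locking discipline (shown earlier in auto_roll_logger.cc, and again in GetLogFileSize and Flush) is worth isolating: pin the current logger in a shared_ptr under the mutex, then do the slow I/O outside the lock, so a concurrent roll cannot delete the instance in use. A hedged standalone sketch with hypothetical Sink and Roller names:

#include <memory>
#include <mutex>

struct Sink {
  void Write(const char* /*msg*/) {}  // stand-in for slow file I/O
};

class Roller {
 public:
  void Log(const char* msg) {
    std::shared_ptr<Sink> pinned;
    {
      std::lock_guard<std::mutex> l(mu_);
      pinned = sink_;   // pin the current sink; cheap under the lock
    }
    pinned->Write(msg);  // slow work happens outside the lock
  }
  void Roll() {
    std::lock_guard<std::mutex> l(mu_);
    sink_ = std::make_shared<Sink>();  // old sink freed once unpinned
  }

 private:
  std::mutex mu_;
  std::shared_ptr<Sink> sink_{std::make_shared<Sink>()};
};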
+ ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); + AutoRollLogger logger(&nse, kTestDir, "", log_size, time); + ASSERT_OK(default_env->FileExists(kLogFile)); + + RollLogFileByTimeTest(&nse, &logger, time, + kSampleMessage + ":RollLogFileByTime"); +} + +TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { + // If only 'log_max_size' options is specified, then every time + // when rocksdb is restarted, a new empty log file will be created. + InitTestDb(); + // WORKAROUND: + // avoid complier's complaint of "comparison between signed + // and unsigned integer expressions" because literal 0 is + // treated as "singed". + size_t kZero = 0; + size_t log_size = 1024; + + AutoRollLogger* logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + + LogMessage(logger, kSampleMessage.c_str()); + ASSERT_GT(logger->GetLogFileSize(), kZero); + delete logger; + + // reopens the log file and an empty log file will be created. + logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), kZero); + delete logger; +} + +TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { + size_t time = 2, log_max_size = 1024 * 5; + + InitTestDb(); + + NoSleepEnv nse(Env::Default()); + AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time); + + // Test the ability to roll by size + RollLogFileBySizeTest(&logger, log_max_size, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); + + // Test the ability to roll by Time + RollLogFileByTimeTest(&nse, &logger, time, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); +} + +#ifndef OS_WIN +// TODO: does not build for Windows because of PosixLogger use below. Need to +// port +TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { + DBOptions options; + NoSleepEnv nse(Env::Default()); + std::shared_ptr<Logger> logger; + + // Normal logger + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_TRUE(dynamic_cast<PosixLogger*>(logger.get())); + + // Only roll by size + InitTestDb(); + options.max_log_file_size = 1024; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + ASSERT_TRUE(auto_roll_logger); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - size"); + + // Only roll by Time + options.env = &nse; + InitTestDb(); + options.max_log_file_size = 0; + options.log_file_time_to_roll = 2; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - time"); + + // roll by both Time and size + InitTestDb(); + options.max_log_file_size = 1024 * 5; + options.log_file_time_to_roll = 2; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - both"); + RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - both"); +} + +TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { + DBOptions options; + std::shared_ptr<Logger> logger; + + InitTestDb(); + options.max_log_file_size = 1024 * 5; + 
ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast<AutoRollLogger*>(logger.get()); + ASSERT_TRUE(auto_roll_logger); + rocksdb::port::Thread flush_thread; + + // Notes: + // (1) Need to pin the old logger before beginning the roll, as rolling grabs + // the mutex, which would prevent us from accessing the old logger. This + // also marks flush_thread with AutoRollLogger::Flush:PinnedLogger. + // (2) Need to reset logger during PosixLogger::Flush() to exercise a race + // condition case, which is executing the flush with the pinned (old) + // logger after auto-roll logger has cut over to a new logger. + // (3) PosixLogger::Flush() happens in both threads but its SyncPoints only + // are enabled in flush_thread (the one pinning the old logger). + rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( + {{"AutoRollLogger::Flush:PinnedLogger", + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"}, + {"PosixLogger::Flush:Begin1", + "AutoRollLogger::ResetLogger:BeforeNewLogger"}, + {"AutoRollLogger::ResetLogger:AfterNewLogger", + "PosixLogger::Flush:Begin2"}}, + {{"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin1"}, + {"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin2"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + flush_thread = port::Thread ([&]() { auto_roll_logger->Flush(); }); + TEST_SYNC_POINT( + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); + RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":LogFlushWhileRolling"); + flush_thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif // OS_WIN + +TEST_F(AutoRollLoggerTest, InfoLogLevel) { + InitTestDb(); + + size_t log_size = 8192; + size_t log_lines = 0; + // an extra-scope to force the AutoRollLogger to flush the log file when it + // becomes out of scope. + { + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + for (int log_type = InfoLogLevel::DEBUG_LEVEL; + log_type <= InfoLogLevel::HEADER_LEVEL; log_type++) { + // log messages with log level smaller than log_level will not be + // logged. + LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str()); + } + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + + // again, messages with level smaller than log_level will not be logged. 
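+ // (Only messages at or above log_level are written, which is why each
+ // pass adds HEADER_LEVEL - log_level + 1 of the six calls to log_lines.)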
+ ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + } + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + size_t lines = std::count(std::istreambuf_iterator<char>(inFile), + std::istreambuf_iterator<char>(), '\n'); + ASSERT_EQ(log_lines, lines); + inFile.close(); +} + +TEST_F(AutoRollLoggerTest, Close) { + InitTestDb(); + + size_t log_size = 8192; + size_t log_lines = 0; + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + for (int log_type = InfoLogLevel::DEBUG_LEVEL; + log_type <= InfoLogLevel::HEADER_LEVEL; log_type++) { + // log messages with log level smaller than log_level will not be + // logged. + LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str()); + } + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + for (int log_level = InfoLogLevel::HEADER_LEVEL; + log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + + // again, messages with level smaller than log_level will not be logged. + ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); + log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; + } + ASSERT_EQ(logger.Close(), Status::OK()); + + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + size_t lines = std::count(std::istreambuf_iterator<char>(inFile), + std::istreambuf_iterator<char>(), '\n'); + ASSERT_EQ(log_lines, lines); + inFile.close(); +} + +// Test the logger Header function for roll over logs +// We expect the new logs creates as roll over to carry the headers specified +static std::vector<std::string> GetOldFileNames(const std::string& path) { + std::vector<std::string> ret; + + const std::string dirname = path.substr(/*start=*/0, path.find_last_of("/")); + const std::string fname = path.substr(path.find_last_of("/") + 1); + + std::vector<std::string> children; + Env::Default()->GetChildren(dirname, &children); + + // We know that the old log files are named [path]<something> + // Return all entities that match the pattern + for (auto& child : children) { + if (fname != child && child.find(fname) == 0) { + ret.push_back(dirname + "/" + child); + } + } + + return ret; +} + +// Return the number of lines where a given pattern was found in the file +static size_t GetLinesCount(const std::string& fname, + const std::string& pattern) { + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(fname.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(pattern) != std::string::npos) { + count++; + } + } + + return count; +} + +TEST_F(AutoRollLoggerTest, LogHeaderTest) { + static const size_t MAX_HEADERS = 10; + static const size_t LOG_MAX_SIZE = 1024 * 5; + static const std::string 
HEADER_STR = "Log header line"; + + // test_num == 0 -> standard call to Header() + // test_num == 1 -> call to Log() with InfoLogLevel::HEADER_LEVEL + for (int test_num = 0; test_num < 2; test_num++) { + + InitTestDb(); + + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + + if (test_num == 0) { + // Log some headers explicitly using Header() + for (size_t i = 0; i < MAX_HEADERS; i++) { + Header(&logger, "%s %" ROCKSDB_PRIszt, HEADER_STR.c_str(), i); + } + } else if (test_num == 1) { + // HEADER_LEVEL should make this behave like calling Header() + for (size_t i = 0; i < MAX_HEADERS; i++) { + ROCKS_LOG_HEADER(&logger, "%s %" ROCKSDB_PRIszt, HEADER_STR.c_str(), i); + } + } + + const std::string newfname = logger.TEST_log_fname(); + + // Log enough data to cause a roll over + int i = 0; + for (size_t iter = 0; iter < 2; iter++) { + while (logger.GetLogFileSize() < LOG_MAX_SIZE) { + Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i); + ++i; + } + + Info(&logger, "Rollover"); + } + + // Flush the log for the latest file + LogFlush(&logger); + + const auto oldfiles = GetOldFileNames(newfname); + + ASSERT_EQ(oldfiles.size(), (size_t) 2); + + for (auto& oldfname : oldfiles) { + // verify that the files rolled over + ASSERT_NE(oldfname, newfname); + // verify that the old log contains all the header logs + ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); + } + } +} + +TEST_F(AutoRollLoggerTest, LogFileExistence) { + rocksdb::DB* db; + rocksdb::Options options; +#ifdef OS_WIN + // Replace all slashes in the path so windows CompSpec does not + // become confused + std::string testDir(kTestDir); + std::replace_if(testDir.begin(), testDir.end(), + [](char ch) { return ch == '/'; }, '\\'); + std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir; +#else + std::string deleteCmd = "rm -rf " + kTestDir; +#endif + ASSERT_EQ(system(deleteCmd.c_str()), 0); + options.max_log_file_size = 100 * 1024 * 1024; + options.create_if_missing = true; + ASSERT_OK(rocksdb::DB::Open(options, kTestDir, &db)); + ASSERT_OK(default_env->FileExists(kLogFile)); + delete db; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as AutoRollLogger is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/util/autovector.h b/src/rocksdb/util/autovector.h new file mode 100644 index 00000000..5843fa8a --- /dev/null +++ b/src/rocksdb/util/autovector.h @@ -0,0 +1,366 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <algorithm> +#include <cassert> +#include <initializer_list> +#include <iterator> +#include <stdexcept> +#include <vector> + +namespace rocksdb { + +#ifdef ROCKSDB_LITE +template <class T, size_t kSize = 8> +class autovector : public std::vector<T> { + using std::vector<T>::vector; +}; +#else +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. 
+// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. +// +// Currently we don't support: +// * reserve()/shrink_to_fit() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template <class T, size_t kSize = 8> +class autovector { + public: + // General STL-style container member types. + typedef T value_type; + typedef typename std::vector<T>::difference_type difference_type; + typedef typename std::vector<T>::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template <class TAutoVector, class TValueType> + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl<TAutoVector, TValueType> self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect), index_(index) {}; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() {} + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // ++iterator + self_type& operator++() { + ++index_; + return *this; + } + + // iterator++ + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // --iterator + self_type& operator--() { + --index_; + return *this; + } + + // iterator-- + self_type operator--(int) { + auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) const { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) const { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + + const_reference operator*() const { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + const_pointer operator->() const { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { return !(*this == other); } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + 
return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl<autovector, value_type> iterator; + typedef iterator_impl<const autovector, const value_type> const_iterator; + typedef std::reverse_iterator<iterator> reverse_iterator; + typedef std::reverse_iterator<const_iterator> const_reverse_iterator; + + autovector() : values_(reinterpret_cast<pointer>(buf_)) {} + + autovector(std::initializer_list<T> init_list) + : values_(reinterpret_cast<pointer>(buf_)) { + for (const T& item : init_list) { + push_back(item); + } + } + + ~autovector() { clear(); } + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. + return vect_.capacity() == 0; + } + + size_type size() const { return num_stack_items_ + vect_.size(); } + + // resize does not guarantee anything about the contents of the newly + // available elements + void resize(size_type n) { + if (n > kSize) { + vect_.resize(n - kSize); + while (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + num_stack_items_ = kSize; + } else { + vect_.clear(); + while (num_stack_items_ < n) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + while (num_stack_items_ > n) { + values_[--num_stack_items_].~value_type(); + } + } + } + + bool empty() const { return size() == 0; } + + const_reference operator[](size_type n) const { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + reference operator[](size_type n) { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + const_reference at(size_type n) const { + assert(n < size()); + return (*this)[n]; + } + + reference at(size_type n) { + assert(n < size()); + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = item; + } else { + vect_.push_back(item); + } + } + + template <class... Args> + void emplace_back(Args&&... 
args) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) + value_type(std::forward<Args>(args)...); + } else { + vect_.emplace_back(std::forward<Args>(args)...); + } + } + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + values_[--num_stack_items_].~value_type(); + } + } + + void clear() { + while (num_stack_items_ > 0) { + values_[--num_stack_items_].~value_type(); + } + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { assign(other); } + + autovector& operator=(const autovector& other) { return assign(other); } + + // -- Iterator Operations + iterator begin() { return iterator(this, 0); } + + const_iterator begin() const { return const_iterator(this, 0); } + + iterator end() { return iterator(this, this->size()); } + + const_iterator end() const { return const_iterator(this, this->size()); } + + reverse_iterator rbegin() { return reverse_iterator(end()); } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { return reverse_iterator(begin()); } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + alignas(alignof( + value_type)) char buf_[kSize * + sizeof(value_type)]; // the first `kSize` items + pointer values_; + // used only if there are more than `kSize` items. + std::vector<T> vect_; +}; + +template <class T, size_t kSize> +autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) { + values_ = reinterpret_cast<pointer>(buf_); + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/src/rocksdb/util/autovector_test.cc b/src/rocksdb/util/autovector_test.cc new file mode 100644 index 00000000..13299669 --- /dev/null +++ b/src/rocksdb/util/autovector_test.cc @@ -0,0 +1,330 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
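A minimal usage sketch of the autovector defined above, illustrative only and not part of the imported sources; it assumes a non-ROCKSDB_LITE build, since only_in_stack() does not exist in the LITE fallback.

#include <cassert>
#include <string>

#include "util/autovector.h"

// Sketch: with kSize == 4, the first four elements live in the in-object
// array; the fifth push_back spills into the internal heap vector.
int main() {
  rocksdb::autovector<std::string, 4> names;
  for (int i = 0; i < 4; ++i) {
    names.push_back("key-" + std::to_string(i));
  }
  assert(names.only_in_stack());  // no heap allocation yet
  names.push_back("key-4");       // crosses kSize, heap vector takes over
  assert(!names.only_in_stack());
  assert(names.size() == 5);
  assert(names[4] == "key-4");    // operator[] spans both storage regions
  return 0;
}

The unit tests that follow verify exactly this stack-to-heap transition.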
+ +#include <atomic> +#include <iostream> +#include <string> +#include <utility> + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/string_util.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using std::cout; +using std::endl; + +namespace rocksdb { + +class AutoVectorTest : public testing::Test {}; +const unsigned long kSize = 8; + +namespace { +template <class T> +void AssertAutoVectorOnlyInStack(autovector<T, kSize>* vec, bool result) { +#ifndef ROCKSDB_LITE + ASSERT_EQ(vec->only_in_stack(), result); +#else + (void) vec; + (void) result; +#endif // !ROCKSDB_LITE +} +} // namespace + +TEST_F(AutoVectorTest, PushBackAndPopBack) { + autovector<size_t, kSize> vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + AssertAutoVectorOnlyInStack(&vec, true); + } else { + AssertAutoVectorOnlyInStack(&vec, false); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + AssertAutoVectorOnlyInStack(&vec, false); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST_F(AutoVectorTest, EmplaceBack) { + typedef std::pair<size_t, std::string> ValType; + autovector<ValType, kSize> vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, ToString(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + AssertAutoVectorOnlyInStack(&vec, true); + } else { + AssertAutoVectorOnlyInStack(&vec, false); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(ToString(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + AssertAutoVectorOnlyInStack(&vec, false); +} + +TEST_F(AutoVectorTest, Resize) { + autovector<size_t, kSize> vec; + + vec.resize(kSize); + AssertAutoVectorOnlyInStack(&vec, true); + for (size_t i = 0; i < kSize; ++i) { + vec[i] = i; + } + + vec.resize(kSize * 2); + AssertAutoVectorOnlyInStack(&vec, false); + for (size_t i = 0; i < kSize; ++i) { + ASSERT_EQ(vec[i], i); + } + for (size_t i = 0; i < kSize; ++i) { + vec[i + kSize] = i; + } + + vec.resize(1); + ASSERT_EQ(1U, vec.size()); +} + +namespace { +void AssertEqual( + const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); +#ifndef ROCKSDB_LITE + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); +#endif // !ROCKSDB_LITE + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} +} // namespace + +TEST_F(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : { kSize / 2, kSize * 1000 }) { + autovector<size_t, kSize> vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector<size_t, kSize> other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector<size_t, kSize> other(vec); + AssertEqual(other, vec); + } + } +} + +TEST_F(AutoVectorTest, Iterators) { + autovector<std::string, kSize> vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(ToString(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast<size_t>(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); + } +} + +namespace { +std::vector<std::string> GetTestKeys(size_t size) { + std::vector<std::string> keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + rocksdb::ToString(index++); + } + return keys; +} +} // namespace + +template <class TVector> +void BenchmarkVectorCreationAndInsertion( + std::string name, size_t ops, size_t item_size, + const std::vector<typename TVector::value_type>& items) { + auto env = Env::Default(); + + int index = 0; + auto start_time = env->NowNanos(); + auto ops_remaining = ops; + while(ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template <class TVector> +size_t BenchmarkSequenceAccess(std::string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; +} + +// This test case only reports the performance between std::vector<std::string> +// and autovector<std::string>. 
We chose string for comparison because in most +// of our use cases we used std::vector<std::string>. +TEST_F(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. + size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialize. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized. + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion<std::vector<std::string>>( + "std::vector<std::string>", kOps, insertions, string_keys); + BenchmarkVectorCreationAndInsertion<autovector<std::string, kSize>>( + "autovector<std::string>", kOps, insertions, string_keys); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + std::vector<uint64_t> int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion<std::vector<uint64_t>>( + "std::vector<uint64_t>", kOps, insertions, int_keys); + BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>( + "autovector<uint64_t>", kOps, insertions, int_keys + ); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + BenchmarkSequenceAccess<std::vector<std::string>>("std::vector", kOps, + elem_size); + BenchmarkSequenceAccess<autovector<std::string, kSize>>("autovector", kOps, + elem_size); + cout << "-----------------------------------" << endl; + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/bloom.cc b/src/rocksdb/util/bloom.cc new file mode 100644 index 00000000..9c05f710 --- /dev/null +++ b/src/rocksdb/util/bloom.cc @@ -0,0 +1,372 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
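Before the bloom filter implementation, a standalone sketch of the cache-local double-hashing scheme that FullFilterBitsBuilder::AddHash below uses. SetBloomBits and kCacheLineBits are our illustrative names, not identifiers from the file; kCacheLineBits stands in for CACHE_LINE_SIZE * 8, assuming 64-byte cache lines.

#include <cstdint>
#include <vector>

// One base hash picks a single cache line, then num_probes bit positions
// inside that line are derived by repeatedly adding a rotated copy of the
// hash (double hashing, per [Kirsch, Mitzenmacher 2006]).
// `data` must hold num_lines cache lines worth of bytes; num_lines > 0.
static void SetBloomBits(uint32_t h, std::vector<uint8_t>* data,
                         uint32_t num_lines, uint32_t num_probes) {
  const uint32_t kCacheLineBits = 64 * 8;
  const uint32_t delta = (h >> 17) | (h << 15);            // rotate right 17
  const uint32_t base = (h % num_lines) * kCacheLineBits;  // choose one line
  for (uint32_t i = 0; i < num_probes; ++i) {
    const uint32_t bitpos = base + (h % kCacheLineBits);   // stay in the line
    (*data)[bitpos / 8] |= static_cast<uint8_t>(1 << (bitpos % 8));
    h += delta;
  }
}

Confining all probes to one line is the locality trick visible in the real code below: a lookup touches a single cache line regardless of num_probes.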
+ +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_bits_builder.h" +#include "table/full_filter_block.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +class BlockBasedFilterBlockBuilder; +class FullFilterBlockBuilder; + +FullFilterBitsBuilder::FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes) + : bits_per_key_(bits_per_key), num_probes_(num_probes) { + assert(bits_per_key_); + } + + FullFilterBitsBuilder::~FullFilterBitsBuilder() {} + + void FullFilterBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) { + uint32_t total_bits, num_lines; + char* data = ReserveSpace(static_cast<int>(hash_entries_.size()), + &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + } + data[total_bits/8] = static_cast<char>(num_probes_); + EncodeFixed32(data + total_bits/8 + 1, static_cast<uint32_t>(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); + } + +uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. + if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + assert(bits_per_key_); + if (num_entry != 0) { + uint32_t total_bits_tmp = num_entry * static_cast<uint32_t>(bits_per_key_); + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; + } + + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} + +char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + char* data = new char[sz]; + memset(data, 0, sz); + return data; +} + +int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) { + assert(bits_per_key_); + assert(space > 0); + uint32_t dont_care1, dont_care2; + int high = (int) (space * 8 / bits_per_key_ + 1); + int low = 1; + int n = high; + for (; n >= low; n--) { + uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2); + if (sz <= space) { + break; + } + } + assert(n < high); // High should be an overestimation + return n; +} + +inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, uint32_t total_bits) { +#ifdef NDEBUG + (void)total_bits; +#endif + assert(num_lines > 0 && total_bits > 0); + + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8); + + for (uint32_t i = 0; i < num_probes_; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be 
optimized + // to a simple operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + data[bitpos / 8] |= (1 << (bitpos % 8)); + + h += delta; + } +} + +namespace { +class FullFilterBitsReader : public FilterBitsReader { + public: + explicit FullFilterBitsReader(const Slice& contents) + : data_(const_cast<char*>(contents.data())), + data_len_(static_cast<uint32_t>(contents.size())), + num_probes_(0), + num_lines_(0), + log2_cache_line_size_(0) { + assert(data_); + GetFilterMeta(contents, &num_probes_, &num_lines_); + // Sanitize broken parameter + if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) { + num_lines_ = 0; + num_probes_ = 0; + } else if (num_lines_ != 0) { + while (true) { + uint32_t num_lines_at_curr_cache_size = + (data_len_ - 5) >> log2_cache_line_size_; + if (num_lines_at_curr_cache_size == 0) { + // The cache line size seems not a power of two. It's not supported + // and indicates a corruption so disable using this filter. + assert(false); + num_lines_ = 0; + num_probes_ = 0; + break; + } + if (num_lines_at_curr_cache_size == num_lines_) { + break; + } + ++log2_cache_line_size_; + } + } + } + + ~FullFilterBitsReader() override {} + + bool MayMatch(const Slice& entry) override { + if (data_len_ <= 5) { // remain same with original filter + return false; + } + // Other Error params, including a broken filter, regarded as match + if (num_probes_ == 0 || num_lines_ == 0) return true; + uint32_t hash = BloomHash(entry); + return HashMayMatch(hash, Slice(data_, data_len_), + num_probes_, num_lines_); + } + + private: + // Filter meta data + char* data_; + uint32_t data_len_; + size_t num_probes_; + uint32_t num_lines_; + uint32_t log2_cache_line_size_; + + // Get num_probes, and num_lines from filter + // If filter format broken, set both to 0. + void GetFilterMeta(const Slice& filter, size_t* num_probes, + uint32_t* num_lines); + + // "filter" contains the data appended by a preceding call to + // FilterBitsBuilder::Finish. This method must return true if the key was + // passed to FilterBitsBuilder::AddKey. This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. + // + // hash: target to be checked + // filter: the whole filter, including meta data bytes + // num_probes: number of probes, read before hand + // num_lines: filter metadata, read before hand + // Before calling this function, need to ensure the input meta data + // is valid. 
+ bool HashMayMatch(const uint32_t& hash, const Slice& filter, + const size_t& num_probes, const uint32_t& num_lines); + + // No Copy allowed + FullFilterBitsReader(const FullFilterBitsReader&); + void operator=(const FullFilterBitsReader&); +}; + +void FullFilterBitsReader::GetFilterMeta(const Slice& filter, + size_t* num_probes, uint32_t* num_lines) { + uint32_t len = static_cast<uint32_t>(filter.size()); + if (len <= 5) { + // filter is empty or broken + *num_probes = 0; + *num_lines = 0; + return; + } + + *num_probes = filter.data()[len - 5]; + *num_lines = DecodeFixed32(filter.data() + len - 4); +} + +bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, + const Slice& filter, const size_t& num_probes, + const uint32_t& num_lines) { + uint32_t len = static_cast<uint32_t>(filter.size()); + if (len <= 5) return false; // remain the same with original filter + + // It is ensured the params are valid before calling it + assert(num_probes != 0); + assert(num_lines != 0 && (len - 5) % num_lines == 0); + const char* data = filter.data(); + + uint32_t h = hash; + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + // Left shift by an extra 3 to convert bytes to bits + uint32_t b = (h % num_lines) << (log2_cache_line_size_ + 3); + PREFETCH(&data[b / 8], 0 /* rw */, 1 /* locality */); + PREFETCH(&data[b / 8 + (1 << log2_cache_line_size_) - 1], 0 /* rw */, + 1 /* locality */); + + for (uint32_t i = 0; i < num_probes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h & ((1 << (log2_cache_line_size_ + 3)) - 1)); + if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + + h += delta; + } + + return true; +} + +// An implementation of filter policy +class BloomFilterPolicy : public FilterPolicy { + public: + explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder) + : bits_per_key_(bits_per_key), hash_func_(BloomHash), + use_block_based_builder_(use_block_based_builder) { + initialize(); + } + + ~BloomFilterPolicy() override {} + + const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + // Compute bloom filter size (in both bits and bytes) + size_t bits = n * bits_per_key_; + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if (bits < 64) bits = 64; + + size_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes + char* array = &(*dst)[init_size]; + for (size_t i = 0; i < (size_t)n; i++) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + uint32_t h = hash_func_(keys[i]); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < num_probes_; j++) { + const uint32_t bitpos = h % bits; + array[bitpos/8] |= (1 << (bitpos % 8)); + h += delta; + } + } + } + + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override { + const size_t len = bloom_filter.size(); + if (len < 2) return false; + + const char* array = bloom_filter.data(); + const size_t bits = (len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. 
+ const size_t k = array[len-1]; + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + + uint32_t h = hash_func_(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k; j++) { + const uint32_t bitpos = h % bits; + if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; + h += delta; + } + return true; + } + + FilterBitsBuilder* GetFilterBitsBuilder() const override { + if (use_block_based_builder_) { + return nullptr; + } + + return new FullFilterBitsBuilder(bits_per_key_, num_probes_); + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + return new FullFilterBitsReader(contents); + } + + // If choose to use block based builder + bool UseBlockBasedBuilder() { return use_block_based_builder_; } + + private: + size_t bits_per_key_; + size_t num_probes_; + uint32_t (*hash_func_)(const Slice& key); + + const bool use_block_based_builder_; + + void initialize() { + // We intentionally round down to reduce probing cost a little bit + num_probes_ = static_cast<size_t>(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (num_probes_ < 1) num_probes_ = 1; + if (num_probes_ > 30) num_probes_ = 30; + } +}; + +} // namespace + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder) { + return new BloomFilterPolicy(bits_per_key, use_block_based_builder); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/bloom_test.cc b/src/rocksdb/util/bloom_test.cc new file mode 100644 index 00000000..4b25e9b6 --- /dev/null +++ b/src/rocksdb/util/bloom_test.cc @@ -0,0 +1,317 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#include <vector> + +#include "rocksdb/filter_policy.h" +#include "table/full_filter_bits_builder.h" +#include "util/arena.h" +#include "util/gflags_compat.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(bits_per_key, 10, ""); + +namespace rocksdb { + +static const int kVerbose = 1; + +static Slice Key(int i, char* buffer) { + std::string s; + PutFixed32(&s, static_cast<uint32_t>(i)); + memcpy(buffer, s.c_str(), sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +class BloomTest : public testing::Test { + private: + const FilterPolicy* policy_; + std::string filter_; + std::vector<std::string> keys_; + + public: + BloomTest() : policy_( + NewBloomFilterPolicy(FLAGS_bits_per_key)) {} + + ~BloomTest() override { delete policy_; } + + void Reset() { + keys_.clear(); + filter_.clear(); + } + + void Add(const Slice& s) { + keys_.push_back(s.ToString()); + } + + void Build() { + std::vector<Slice> key_slices; + for (size_t i = 0; i < keys_.size(); i++) { + key_slices.push_back(Slice(keys_[i])); + } + filter_.clear(); + policy_->CreateFilter(&key_slices[0], static_cast<int>(key_slices.size()), + &filter_); + keys_.clear(); + if (kVerbose >= 2) DumpFilter(); + } + + size_t FilterSize() const { + return filter_.size(); + } + + void DumpFilter() { + fprintf(stderr, "F("); + for (size_t i = 0; i+1 < filter_.size(); i++) { + const unsigned int c = static_cast<unsigned int>(filter_[i]); + for (int j = 0; j < 8; j++) { + fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.'); + } + } + fprintf(stderr, ")\n"); + } + + bool Matches(const Slice& s) { + if (!keys_.empty()) { + Build(); + } + return policy_->KeyMayMatch(s, filter_); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST_F(BloomTest, EmptyFilter) { + ASSERT_TRUE(! Matches("hello")); + ASSERT_TRUE(! Matches("world")); +} + +TEST_F(BloomTest, Small) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(! Matches("x")); + ASSERT_TRUE(! 
Matches("foo")); +} + +TEST_F(BloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast<int>(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often + else good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +class FullBloomTest : public testing::Test { + private: + const FilterPolicy* policy_; + std::unique_ptr<FilterBitsBuilder> bits_builder_; + std::unique_ptr<FilterBitsReader> bits_reader_; + std::unique_ptr<const char[]> buf_; + size_t filter_size_; + + public: + FullBloomTest() : + policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)), + filter_size_(0) { + Reset(); + } + + ~FullBloomTest() override { delete policy_; } + + FullFilterBitsBuilder* GetFullFilterBitsBuilder() { + return dynamic_cast<FullFilterBitsBuilder*>(bits_builder_.get()); + } + + void Reset() { + bits_builder_.reset(policy_->GetFilterBitsBuilder()); + bits_reader_.reset(nullptr); + buf_.reset(nullptr); + filter_size_ = 0; + } + + void Add(const Slice& s) { + bits_builder_->AddKey(s); + } + + void Build() { + Slice filter = bits_builder_->Finish(&buf_); + bits_reader_.reset(policy_->GetFilterBitsReader(filter)); + filter_size_ = filter.size(); + } + + size_t FilterSize() const { + return filter_size_; + } + + bool Matches(const Slice& s) { + if (bits_reader_ == nullptr) { + Build(); + } + return bits_reader_->MayMatch(s); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST_F(FullBloomTest, FilterSize) { + uint32_t dont_care1, dont_care2; + auto full_bits_builder = GetFullFilterBitsBuilder(); + for (int n = 1; n < 100; n++) { + auto space = full_bits_builder->CalculateSpace(n, &dont_care1, &dont_care2); + auto n2 = full_bits_builder->CalculateNumEntry(space); + ASSERT_GE(n2, n); + auto space2 = + full_bits_builder->CalculateSpace(n2, &dont_care1, &dont_care2); + ASSERT_EQ(space, space2); + } +} + +TEST_F(FullBloomTest, FullEmptyFilter) { + // Empty filter is not match, at this level + ASSERT_TRUE(!Matches("hello")); + ASSERT_TRUE(!Matches("world")); +} + +TEST_F(FullBloomTest, FullSmall) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(!Matches("x")); + ASSERT_TRUE(!Matches("foo")); +} + +TEST_F(FullBloomTest, FullVaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 
10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast<int>(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + + return RUN_ALL_TESTS(); +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/build_version.cc.in b/src/rocksdb/util/build_version.cc.in new file mode 100644 index 00000000..d2e8c578 --- /dev/null +++ b/src/rocksdb/util/build_version.cc.in @@ -0,0 +1,4 @@ +#include "build_version.h" +const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:@@GIT_SHA@@"; +const char* rocksdb_build_git_date = "rocksdb_build_git_date:@@GIT_DATE_TIME@@"; +const char* rocksdb_build_compile_date = __DATE__; diff --git a/src/rocksdb/util/build_version.h b/src/rocksdb/util/build_version.h new file mode 100644 index 00000000..36ff92c0 --- /dev/null +++ b/src/rocksdb/util/build_version.h @@ -0,0 +1,15 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#if !defined(IOS_CROSS_COMPILE) +// if we compile with Xcode, we don't run build_detect_version, so we don't +// generate these variables +// this variable tells us about the git revision +extern const char* rocksdb_build_git_sha; + +// Date on which the code was compiled: +extern const char* rocksdb_build_compile_date; +#endif diff --git a/src/rocksdb/util/cast_util.h b/src/rocksdb/util/cast_util.h new file mode 100644 index 00000000..2dc8138a --- /dev/null +++ b/src/rocksdb/util/cast_util.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { +// A helper function to assert that replacing dynamic_cast<> with +// static_cast<> is correct. This function exists to deal with legacy code. +// It is not recommended to add new code that performs class casting. The +// preferred solution is to implement the functionality without the need +// for casting. 
+template <class DestClass, class SrcClass> +inline DestClass* static_cast_with_check(SrcClass* x) { + DestClass* ret = static_cast<DestClass*>(x); +#ifdef ROCKSDB_USE_RTTI + assert(ret == dynamic_cast<DestClass*>(x)); +#endif + return ret; +} +} // namespace rocksdb diff --git a/src/rocksdb/util/channel.h b/src/rocksdb/util/channel.h new file mode 100644 index 00000000..0225482c --- /dev/null +++ b/src/rocksdb/util/channel.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <condition_variable> +#include <mutex> +#include <queue> +#include <utility> + +namespace rocksdb { + +template <class T> +class channel { + public: + explicit channel() : eof_(false) {} + + channel(const channel&) = delete; + void operator=(const channel&) = delete; + + void sendEof() { + std::lock_guard<std::mutex> lk(lock_); + eof_ = true; + cv_.notify_all(); + } + + bool eof() { + std::lock_guard<std::mutex> lk(lock_); + return buffer_.empty() && eof_; + } + + size_t size() const { + std::lock_guard<std::mutex> lk(lock_); + return buffer_.size(); + } + + // writes elem to the queue + void write(T&& elem) { + std::unique_lock<std::mutex> lk(lock_); + buffer_.emplace(std::forward<T>(elem)); + cv_.notify_one(); + } + + /// Moves a dequeued element onto elem, blocking until an element + /// is available. + // returns false if EOF + bool read(T& elem) { + std::unique_lock<std::mutex> lk(lock_); + cv_.wait(lk, [&] { return eof_ || !buffer_.empty(); }); + if (eof_ && buffer_.empty()) { + return false; + } + elem = std::move(buffer_.front()); + buffer_.pop(); + cv_.notify_one(); + return true; + } + + private: + std::condition_variable cv_; + std::mutex lock_; + std::queue<T> buffer_; + bool eof_; +}; +} // namespace rocksdb diff --git a/src/rocksdb/util/coding.cc b/src/rocksdb/util/coding.cc new file mode 100644 index 00000000..b5cfac86 --- /dev/null +++ b/src/rocksdb/util/coding.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
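A hypothetical producer/consumer sketch for the channel<T> template above, not part of the imported sources; the thread bodies and message strings are ours.

#include <string>
#include <thread>

#include "util/channel.h"

int main() {
  rocksdb::channel<std::string> ch;
  std::thread producer([&ch] {
    for (int i = 0; i < 3; ++i) {
      ch.write("msg-" + std::to_string(i));  // wakes one blocked reader
    }
    ch.sendEof();  // once the queue drains, read() returns false
  });
  std::string msg;
  while (ch.read(msg)) {
    // read() blocks until an element or EOF is available
  }
  producer.join();
  return 0;
}

Note that eof() reports true only once the buffer is empty, so a reader never loses queued elements to a racing sendEof().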
+ +#include "util/coding.h" + +#include <algorithm> +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +// conversion' conversion from 'type1' to 'type2', possible loss of data +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4244) +#endif +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast<char*>(ptr); +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +const char* GetVarint32PtrFallback(const char* p, const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return nullptr; +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast<const unsigned char*>(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast<const char*>(p); + } + } + return nullptr; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/coding.h b/src/rocksdb/util/coding.h new file mode 100644 index 00000000..4046a2b6 --- /dev/null +++ b/src/rocksdb/util/coding.h @@ -0,0 +1,455 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#pragma once +#include <algorithm> +#include <stdint.h> +#include <string.h> +#include <string> + +#include "rocksdb/write_batch.h" +#include "port/port.h" + +// Some processors does not allow unaligned access to memory +#if defined(__sparc) + #define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED +#endif + +namespace rocksdb { + +// The maximum length of a varint in bytes for 64-bit. +const unsigned int kMaxVarint64Length = 10; + +// Standard Put... 
routines append to a string +extern void PutFixed16(std::string* dst, uint16_t value); +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint32Varint32(std::string* dst, uint32_t value1, + uint32_t value2); +extern void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1, + uint32_t value2, uint32_t value3); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutVarint64Varint64(std::string* dst, uint64_t value1, + uint64_t value2); +extern void PutVarint32Varint64(std::string* dst, uint32_t value1, + uint64_t value2); +extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1, + uint32_t value2, uint64_t value3); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); +extern void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetFixed64(Slice* input, uint64_t* value); +extern bool GetFixed32(Slice* input, uint32_t* value); +extern bool GetFixed16(Slice* input, uint16_t* value); +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +// This function assumes data is well-formed. +extern Slice GetLengthPrefixedSlice(const char* data); + +extern Slice GetSliceUntil(Slice* slice, char delimiter); + +// Borrowed from +// https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208 +constexpr inline uint64_t i64ToZigzag(const int64_t l) { + return (static_cast<uint64_t>(l) << 1) ^ static_cast<uint64_t>(l >> 63); +} +inline int64_t zigzagToI64(uint64_t n) { + return (n >> 1) ^ -static_cast<int64_t>(n & 1); +} + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// nullptr on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); +inline const char* GetVarsignedint64Ptr(const char* p, const char* limit, + int64_t* value) { + uint64_t u = 0; + const char* ret = GetVarint64Ptr(p, limit, &u); + *value = zigzagToI64(u); + return ret; +} + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed16(char* dst, uint16_t value); +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. 
+ +inline uint16_t DecodeFixed16(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint16_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast<uint16_t>(static_cast<unsigned char>(ptr[0]))) | + (static_cast<uint16_t>(static_cast<unsigned char>(ptr[1])) << 8)); + } +} + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) + | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast<const unsigned char*>(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +// -- Implementation of the functions declared above +inline void EncodeFixed16(char* buf, uint16_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + } +} + +inline void EncodeFixed32(char* buf, uint32_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + } +} + +inline void EncodeFixed64(char* buf, uint64_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; + } +} + +// Pull the last 8 bits and cast it to a character +inline void PutFixed16(std::string* dst, uint16_t value) { + if (port::kLittleEndian) { + dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed16(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + if (port::kLittleEndian) { + dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + if (port::kLittleEndian) { + dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void 
PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint32(std::string* dst, uint32_t v1, uint32_t v2) { + char buf[10]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1, + uint32_t v2, uint32_t v3) { + char buf[15]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + ptr = EncodeVarint32(ptr, v3); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast<unsigned char*>(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast<unsigned char>(v); + return reinterpret_cast<char*>(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[kMaxVarint64Length]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarsignedint64(std::string* dst, int64_t v) { + char buf[kMaxVarint64Length]; + // Using Zigzag format to convert signed to unsigned + char* ptr = EncodeVarint64(buf, i64ToZigzag(v)); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) { + char buf[20]; + char* ptr = EncodeVarint64(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint64(std::string* dst, uint32_t v1, uint64_t v2) { + char buf[15]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutVarint32Varint32Varint64(std::string* dst, uint32_t v1, + uint32_t v2, uint64_t v3) { + char buf[20]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + ptr = EncodeVarint64(ptr, v3); + dst->append(buf, static_cast<size_t>(ptr - buf)); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, static_cast<uint32_t>(value.size())); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + size_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, static_cast<uint32_t>(total_bytes)); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetFixed64(Slice* input, uint64_t* value) { + if (input->size() < sizeof(uint64_t)) { + return false; + } + *value = DecodeFixed64(input->data()); + input->remove_prefix(sizeof(uint64_t)); + return true; +} + +inline bool GetFixed32(Slice* input, uint32_t* value) { + if (input->size() < sizeof(uint32_t)) { + return false; + } + *value = DecodeFixed32(input->data()); + input->remove_prefix(sizeof(uint32_t)); + return true; +} + +inline bool GetFixed16(Slice* input, uint16_t* value) { + if (input->size() < sizeof(uint16_t)) { + return false; + } + *value = DecodeFixed16(input->data()); + input->remove_prefix(sizeof(uint16_t)); + return true; +} + +inline bool GetVarint32(Slice* input, 
uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast<size_t>(limit - q)); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast<size_t>(limit - q)); + return true; + } +} + +// Provide an interface for platform independent endianness transformation +inline uint64_t EndianTransform(uint64_t input, size_t size) { + char* pos = reinterpret_cast<char*>(&input); + uint64_t ret_val = 0; + for (size_t i = 0; i < size; ++i) { + ret_val |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i])) + << ((size - i - 1) << 3)); + } + return ret_val; +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + // unsigned char is 7 bits, uint32_t is 32 bits, need 5 unsigned char + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} + +template<class T> +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void PutUnaligned(T *memory, const T &value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char *nonAlignedMemory = reinterpret_cast<char*>(memory); + memcpy(nonAlignedMemory, reinterpret_cast<const char*>(&value), sizeof(T)); +#else + *memory = value; +#endif +} + +template<class T> +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void GetUnaligned(const T *memory, T *value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char *nonAlignedMemory = reinterpret_cast<char*>(value); + memcpy(nonAlignedMemory, reinterpret_cast<const char*>(memory), sizeof(T)); +#else + *value = *memory; +#endif +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/coding_test.cc b/src/rocksdb/util/coding_test.cc new file mode 100644 index 00000000..f7b1671d --- /dev/null +++ b/src/rocksdb/util/coding_test.cc @@ -0,0 +1,217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
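Before the unit tests, a worked example of the varint32 format implemented in coding.h/coding.cc above: each byte carries seven payload bits and the high bit marks that more bytes follow, so 300 (binary 100101100) encodes as 0xAC, 0x02. A minimal round-trip sketch, using only the Put/Get helpers declared above:

#include <cassert>
#include <string>

#include "util/coding.h"

int main() {
  std::string buf;
  rocksdb::PutVarint32(&buf, 300);
  assert(buf.size() == 2);  // 300 needs two 7-bit groups
  assert(static_cast<unsigned char>(buf[0]) == 0xAC);  // low 7 bits, msb set
  assert(static_cast<unsigned char>(buf[1]) == 0x02);  // remaining bits

  rocksdb::Slice in(buf);
  uint32_t v = 0;
  assert(rocksdb::GetVarint32(&in, &v) && v == 300);
  assert(in.empty());  // the slice was advanced past the parsed value
  return 0;
}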
+ +#include "util/coding.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class Coding { }; +TEST(Coding, Fixed16) { + std::string s; + for (uint16_t v = 0; v < 0xFFFF; v++) { + PutFixed16(&s, v); + } + + const char* p = s.data(); + for (uint16_t v = 0; v < 0xFFFF; v++) { + uint16_t actual = DecodeFixed16(p); + ASSERT_EQ(v, actual); + p += sizeof(uint16_t); + } +} + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast<uint64_t>(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast<uint64_t>(1) << power; + uint64_t actual = 0; + actual = DecodeFixed64(p); + ASSERT_EQ(v-1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+1, actual); + p += sizeof(uint64_t); + } +} + +// Test that encoding routines generate little-endian encodings +TEST(Coding, EncodingOutput) { + std::string dst; + PutFixed32(&dst, 0x04030201); + ASSERT_EQ(4U, dst.size()); + ASSERT_EQ(0x01, static_cast<int>(dst[0])); + ASSERT_EQ(0x02, static_cast<int>(dst[1])); + ASSERT_EQ(0x03, static_cast<int>(dst[2])); + ASSERT_EQ(0x04, static_cast<int>(dst[3])); + + dst.clear(); + PutFixed64(&dst, 0x0807060504030201ull); + ASSERT_EQ(8U, dst.size()); + ASSERT_EQ(0x01, static_cast<int>(dst[0])); + ASSERT_EQ(0x02, static_cast<int>(dst[1])); + ASSERT_EQ(0x03, static_cast<int>(dst[2])); + ASSERT_EQ(0x04, static_cast<int>(dst[3])); + ASSERT_EQ(0x05, static_cast<int>(dst[4])); + ASSERT_EQ(0x06, static_cast<int>(dst[5])); + ASSERT_EQ(0x07, static_cast<int>(dst[6])); + ASSERT_EQ(0x08, static_cast<int>(dst[7])); +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t expected = (i / 32) << (i % 32); + uint32_t actual = 0; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector<uint64_t> values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast<uint64_t>(0)); + values.push_back(~static_cast<uint64_t>(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power-1); + values.push_back(power+1); + }; + + std::string s; + for (unsigned int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (unsigned int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual = 0; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); 
+ } + ASSERT_EQ(p, limit); + +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/compaction_job_stats_impl.cc b/src/rocksdb/util/compaction_job_stats_impl.cc new file mode 100644 index 00000000..a1ebc8b9 --- /dev/null +++ b/src/rocksdb/util/compaction_job_stats_impl.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/compaction_job_stats.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +void CompactionJobStats::Reset() { + elapsed_micros = 0; + cpu_micros = 0; + + num_input_records = 0; + num_input_files = 0; + num_input_files_at_output_level = 0; + + num_output_records = 0; + num_output_files = 0; + + is_manual_compaction = 0; + + total_input_bytes = 0; + total_output_bytes = 0; + + num_records_replaced = 0; + + total_input_raw_key_bytes = 0; + total_input_raw_value_bytes = 0; + + num_input_deletion_records = 0; + num_expired_deletion_records = 0; + + num_corrupt_keys = 0; + + file_write_nanos = 0; + file_range_sync_nanos = 0; + file_fsync_nanos = 0; + file_prepare_write_nanos = 0; + + num_single_del_fallthru = 0; + num_single_del_mismatch = 0; +} + +void CompactionJobStats::Add(const CompactionJobStats& stats) { + elapsed_micros += stats.elapsed_micros; + cpu_micros += stats.cpu_micros; + + num_input_records += stats.num_input_records; + num_input_files += stats.num_input_files; + num_input_files_at_output_level += stats.num_input_files_at_output_level; + + num_output_records += stats.num_output_records; + num_output_files += stats.num_output_files; + + total_input_bytes += stats.total_input_bytes; + total_output_bytes += stats.total_output_bytes; + + num_records_replaced += stats.num_records_replaced; + + total_input_raw_key_bytes += stats.total_input_raw_key_bytes; + total_input_raw_value_bytes += stats.total_input_raw_value_bytes; + + num_input_deletion_records += stats.num_input_deletion_records; + num_expired_deletion_records += stats.num_expired_deletion_records; + + num_corrupt_keys += stats.num_corrupt_keys; + + file_write_nanos += stats.file_write_nanos; + file_range_sync_nanos += stats.file_range_sync_nanos; + file_fsync_nanos += stats.file_fsync_nanos; + file_prepare_write_nanos += stats.file_prepare_write_nanos; + + num_single_del_fallthru += stats.num_single_del_fallthru; + num_single_del_mismatch += stats.num_single_del_mismatch; +} + +#else + +void CompactionJobStats::Reset() {} + +void CompactionJobStats::Add(const CompactionJobStats& /*stats*/) {} + +#endif // !ROCKSDB_LITE + +} // namespace rocksdb diff --git a/src/rocksdb/util/comparator.cc b/src/rocksdb/util/comparator.cc new file mode 100644 index 00000000..b42c2372 --- /dev/null +++ b/src/rocksdb/util/comparator.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include <algorithm> +#include <memory> +#include <stdint.h> +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "port/port.h" +#include "util/logging.h" + +namespace rocksdb { + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + const char* Name() const override { return "leveldb.BytewiseComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + return a.compare(b); + } + + bool Equal(const Slice& a, const Slice& b) const override { return a == b; } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]); + uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]); + if (start_byte >= limit_byte) { + // Cannot shorten since limit is smaller than start or start is + // already the shortest possible. + return; + } + assert(start_byte < limit_byte); + + if (diff_index < limit.size() - 1 || start_byte + 1 < limit_byte) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + } else { + // v + // A A 1 A A A + // A A 2 + // + // Incrementing the current byte will make start bigger than limit, we + // will skip this byte, and find the first non 0xFF byte in start and + // increment it. + diff_index++; + + while (diff_index < start->size()) { + // Keep moving until we find the first non 0xFF byte to + // increment it + if (static_cast<uint8_t>((*start)[diff_index]) < + static_cast<uint8_t>(0xff)) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + break; + } + diff_index++; + } + } + assert(Compare(*start, limit) < 0); + } + } + + void FindShortSuccessor(std::string* key) const override { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast<uint8_t>(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. 
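+    // (Illustrative, not in the original:) "abc" -> "b"; "\xff\xffq" ->
+    // "\xff\xffr"; a key consisting entirely of 0xff bytes is unchanged.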
+ } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + if (s.size() != t.size() || s.size() == 0) { + return false; + } + size_t diff_ind = s.difference_offset(t); + // same slice + if (diff_ind >= s.size()) return false; + uint8_t byte_s = static_cast<uint8_t>(s[diff_ind]); + uint8_t byte_t = static_cast<uint8_t>(t[diff_ind]); + // first different byte must be consecutive, and remaining bytes must be + // 0xff for s and 0x00 for t + if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) { + for (size_t i = diff_ind + 1; i < s.size(); ++i) { + byte_s = static_cast<uint8_t>(s[i]); + byte_t = static_cast<uint8_t>(t[i]); + if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) { + return false; + } + } + return true; + } else { + return false; + } + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } +}; + +class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { + public: + ReverseBytewiseComparatorImpl() { } + + const char* Name() const override { + return "rocksdb.ReverseBytewiseComparator"; + } + + int Compare(const Slice& a, const Slice& b) const override { + return -a.compare(b); + } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + assert(diff_index <= min_length); + if (diff_index == min_length) { + // Do not shorten if one string is a prefix of the other + // + // We could handle cases like: + // V + // A A 2 X Y + // A A 2 + // in a similar way as BytewiseComparator::FindShortestSeparator(). + // We keep it simple by not implementing it. We can come back to it + // later when needed. + } else { + uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]); + uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]); + if (start_byte > limit_byte && diff_index < start->size() - 1) { + // Case like + // V + // A A 3 A A + // A A 1 B B + // + // or + // v + // A A 2 A A + // A A 1 B B + // In this case "AA2" will be good. +#ifndef NDEBUG + std::string old_start = *start; +#endif + start->resize(diff_index + 1); +#ifndef NDEBUG + assert(old_start >= *start); +#endif + assert(Slice(*start).compare(limit) > 0); + } + } + } + + void FindShortSuccessor(std::string* /*key*/) const override { + // Don't do anything for simplicity. + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } +}; +}// namespace + +const Comparator* BytewiseComparator() { + static BytewiseComparatorImpl bytewise; + return &bytewise; +} + +const Comparator* ReverseBytewiseComparator() { + static ReverseBytewiseComparatorImpl rbytewise; + return &rbytewise; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/compression.h b/src/rocksdb/util/compression.h new file mode 100644 index 00000000..b901ceb3 --- /dev/null +++ b/src/rocksdb/util/compression.h @@ -0,0 +1,1347 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +// +#pragma once + +#include <algorithm> +#include <limits> +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else // OS_FREEBSD +#include <malloc.h> +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#include <string> + +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "util/coding.h" +#include "util/compression_context_cache.h" +#include "util/memory_allocator.h" +#include "util/string_util.h" + +#ifdef SNAPPY +#include <snappy.h> +#endif + +#ifdef ZLIB +#include <zlib.h> +#endif + +#ifdef BZIP2 +#include <bzlib.h> +#endif + +#if defined(LZ4) +#include <lz4.h> +#include <lz4hc.h> +#endif + +#if defined(ZSTD) +#include <zstd.h> +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ +#include <zdict.h> +#endif // ZSTD_VERSION_NUMBER >= 10103 +namespace rocksdb { +// Need this for the context allocation override +// On windows we need to do this explicitly +#if (ZSTD_VERSION_NUMBER >= 500) +#if defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && \ + defined(ZSTD_STATIC_LINKING_ONLY) +#define ROCKSDB_ZSTD_CUSTOM_MEM +namespace port { +ZSTD_customMem GetJeZstdAllocationOverrides(); +} // namespace port +#endif // defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && + // defined(ZSTD_STATIC_LINKING_ONLY) + +// We require `ZSTD_sizeof_DDict` and `ZSTD_createDDict_byReference` to use +// `ZSTD_DDict`. The former was introduced in v1.0.0 and the latter was +// introduced in v1.1.3. But an important bug fix for `ZSTD_sizeof_DDict` came +// in v1.1.4, so that is the version we require. As of today's latest version +// (v1.3.8), they are both still in the experimental API, which means they are +// only exported when the compiler flag `ZSTD_STATIC_LINKING_ONLY` is set. +#if defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 +#define ROCKSDB_ZSTD_DDICT +#endif // defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 + +// Cached data represents a portion that can be re-used +// If, in the future we have more than one native context to +// cache we can arrange this as a tuple +class ZSTDUncompressCachedData { + public: + using ZSTDNativeContext = ZSTD_DCtx*; + ZSTDUncompressCachedData() {} + // Init from cache + ZSTDUncompressCachedData(const ZSTDUncompressCachedData& o) = delete; + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&& o) ROCKSDB_NOEXCEPT + : ZSTDUncompressCachedData() { + *this = std::move(o); + } + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&& o) + ROCKSDB_NOEXCEPT { + assert(zstd_ctx_ == nullptr); + std::swap(zstd_ctx_, o.zstd_ctx_); + std::swap(cache_idx_, o.cache_idx_); + return *this; + } + ZSTDNativeContext Get() const { return zstd_ctx_; } + int64_t GetCacheIndex() const { return cache_idx_; } + void CreateIfNeeded() { + if (zstd_ctx_ == nullptr) { +#ifdef ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = + ZSTD_createDCtx_advanced(port::GetJeZstdAllocationOverrides()); +#else // ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = ZSTD_createDCtx(); +#endif // ROCKSDB_ZSTD_CUSTOM_MEM + cache_idx_ = -1; + } + } + void InitFromCache(const ZSTDUncompressCachedData& o, int64_t idx) { + zstd_ctx_ = o.zstd_ctx_; + cache_idx_ = idx; + } + ~ZSTDUncompressCachedData() { + if (zstd_ctx_ != nullptr && cache_idx_ == -1) { + ZSTD_freeDCtx(zstd_ctx_); + } + } + + private: + ZSTDNativeContext zstd_ctx_ = nullptr; + int64_t cache_idx_ = -1; // -1 means this instance owns the context +}; +#endif // 
(ZSTD_VERSION_NUMBER >= 500) +} // namespace rocksdb +#endif // ZSTD + +#if !(defined ZSTD) || !(ZSTD_VERSION_NUMBER >= 500) +namespace rocksdb { +class ZSTDUncompressCachedData { + void* padding; // unused + public: + using ZSTDNativeContext = void*; + ZSTDUncompressCachedData() {} + ZSTDUncompressCachedData(const ZSTDUncompressCachedData&) {} + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&&) + ROCKSDB_NOEXCEPT = default; + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&&) + ROCKSDB_NOEXCEPT = default; + ZSTDNativeContext Get() const { return nullptr; } + int64_t GetCacheIndex() const { return -1; } + void CreateIfNeeded() {} + void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {} + private: + void ignore_padding__() { padding = nullptr; } +}; +} // namespace rocksdb +#endif + +#if defined(XPRESS) +#include "port/xpress.h" +#endif + +namespace rocksdb { + +// Holds dictionary and related data, like ZSTD's digested compression +// dictionary. +struct CompressionDict { +#if ZSTD_VERSION_NUMBER >= 700 + ZSTD_CDict* zstd_cdict_ = nullptr; +#endif // ZSTD_VERSION_NUMBER >= 700 + std::string dict_; + + public: +#if ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType type, int level) { +#else // ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType /*type*/, int /*level*/) { +#endif // ZSTD_VERSION_NUMBER >= 700 + dict_ = std::move(dict); +#if ZSTD_VERSION_NUMBER >= 700 + zstd_cdict_ = nullptr; + if (!dict_.empty() && (type == kZSTD || type == kZSTDNotFinalCompression)) { + if (level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } + // Should be safe (but slower) if below call fails as we'll use the + // raw dictionary to compress. + zstd_cdict_ = ZSTD_createCDict(dict_.data(), dict_.size(), level); + assert(zstd_cdict_ != nullptr); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + } + + ~CompressionDict() { +#if ZSTD_VERSION_NUMBER >= 700 + size_t res = 0; + if (zstd_cdict_ != nullptr) { + res = ZSTD_freeCDict(zstd_cdict_); + } + assert(res == 0); // Last I checked they can't fail + (void)res; // prevent unused var warning +#endif // ZSTD_VERSION_NUMBER >= 700 + } + +#if ZSTD_VERSION_NUMBER >= 700 + const ZSTD_CDict* GetDigestedZstdCDict() const { return zstd_cdict_; } +#endif // ZSTD_VERSION_NUMBER >= 700 + + Slice GetRawDict() const { return dict_; } + + static const CompressionDict& GetEmptyDict() { + static CompressionDict empty_dict{}; + return empty_dict; + } + + CompressionDict() = default; + // Disable copy/move + CompressionDict(const CompressionDict&) = delete; + CompressionDict& operator=(const CompressionDict&) = delete; + CompressionDict(CompressionDict&&) = delete; + CompressionDict& operator=(CompressionDict&&) = delete; +}; + +// Holds dictionary and related data, like ZSTD's digested uncompression +// dictionary. +struct UncompressionDict { +#ifdef ROCKSDB_ZSTD_DDICT + ZSTD_DDict* zstd_ddict_; +#endif // ROCKSDB_ZSTD_DDICT + // Block containing the data for the compression dictionary. It may be + // redundant with the data held in `zstd_ddict_`. + std::string dict_; + // This `Statistics` pointer is intended to be used upon block cache eviction, + // so only needs to be populated on `UncompressionDict`s that'll be inserted + // into block cache. 
+  Statistics* statistics_;
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool using_zstd,
+                    Statistics* _statistics = nullptr) {
+#else   // ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool /*using_zstd*/,
+                    Statistics* _statistics = nullptr) {
+#endif  // ROCKSDB_ZSTD_DDICT
+    dict_ = std::move(dict);
+    statistics_ = _statistics;
+#ifdef ROCKSDB_ZSTD_DDICT
+    zstd_ddict_ = nullptr;
+    if (!dict_.empty() && using_zstd) {
+      zstd_ddict_ = ZSTD_createDDict_byReference(dict_.data(), dict_.size());
+      assert(zstd_ddict_ != nullptr);
+    }
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+  ~UncompressionDict() {
+#ifdef ROCKSDB_ZSTD_DDICT
+    size_t res = 0;
+    if (zstd_ddict_ != nullptr) {
+      res = ZSTD_freeDDict(zstd_ddict_);
+    }
+    assert(res == 0);  // Last I checked they can't fail
+    (void)res;         // prevent unused var warning
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; }
+#endif  // ROCKSDB_ZSTD_DDICT
+
+  Slice GetRawDict() const { return dict_; }
+
+  static const UncompressionDict& GetEmptyDict() {
+    static UncompressionDict empty_dict{};
+    return empty_dict;
+  }
+
+  Statistics* statistics() const { return statistics_; }
+
+  size_t ApproximateMemoryUsage() {
+    size_t usage = 0;
+    usage += sizeof(struct UncompressionDict);
+#ifdef ROCKSDB_ZSTD_DDICT
+    usage += ZSTD_sizeof_DDict(zstd_ddict_);
+#endif  // ROCKSDB_ZSTD_DDICT
+    usage += dict_.size();
+    return usage;
+  }
+
+  UncompressionDict() = default;
+  // Disable copy/move (the deleted signatures must name UncompressionDict,
+  // not CompressionDict, or the implicit copy operations are still generated)
+  UncompressionDict(const UncompressionDict&) = delete;
+  UncompressionDict& operator=(const UncompressionDict&) = delete;
+  UncompressionDict(UncompressionDict&&) = delete;
+  UncompressionDict& operator=(UncompressionDict&&) = delete;
+};
+
+class CompressionContext {
+ private:
+#if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500)
+  ZSTD_CCtx* zstd_ctx_ = nullptr;
+  void CreateNativeContext(CompressionType type) {
+    if (type == kZSTD || type == kZSTDNotFinalCompression) {
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ =
+          ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else   // ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ = ZSTD_createCCtx();
+#endif  // ROCKSDB_ZSTD_CUSTOM_MEM
+    }
+  }
+  void DestroyNativeContext() {
+    if (zstd_ctx_ != nullptr) {
+      ZSTD_freeCCtx(zstd_ctx_);
+    }
+  }
+
+ public:
+  // callable inside ZSTD_Compress
+  ZSTD_CCtx* ZSTDPreallocCtx() const {
+    assert(zstd_ctx_ != nullptr);
+    return zstd_ctx_;
+  }
+
+#else   // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ private:
+  void CreateNativeContext(CompressionType /* type */) {}
+  void DestroyNativeContext() {}
+#endif  // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ public:
+  explicit CompressionContext(CompressionType type) {
+    CreateNativeContext(type);
+  }
+  ~CompressionContext() { DestroyNativeContext(); }
+  CompressionContext(const CompressionContext&) = delete;
+  CompressionContext& operator=(const CompressionContext&) = delete;
+};
+
+class CompressionInfo {
+  const CompressionOptions& opts_;
+  const CompressionContext& context_;
+  const CompressionDict& dict_;
+  const CompressionType type_;
+  const uint64_t sample_for_compression_;
+
+ public:
+  CompressionInfo(const CompressionOptions& _opts,
+                  const CompressionContext& _context,
+                  const CompressionDict& _dict, CompressionType _type,
+                  uint64_t _sample_for_compression)
+      : opts_(_opts),
+        context_(_context),
+        dict_(_dict),
+        type_(_type),
+        sample_for_compression_(_sample_for_compression) {}
+
+  const CompressionOptions& options() const { return opts_; }
+  const
CompressionContext& context() const { return context_; } + const CompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } + uint64_t SampleForCompression() const { return sample_for_compression_; } +}; + +class UncompressionContext { + private: + CompressionContextCache* ctx_cache_ = nullptr; + ZSTDUncompressCachedData uncomp_cached_data_; + + public: + struct NoCache {}; + // Do not use context cache, used by TableBuilder + UncompressionContext(NoCache, CompressionType /* type */) {} + + explicit UncompressionContext(CompressionType type) { + if (type == kZSTD || type == kZSTDNotFinalCompression) { + ctx_cache_ = CompressionContextCache::Instance(); + uncomp_cached_data_ = ctx_cache_->GetCachedZSTDUncompressData(); + } + } + ~UncompressionContext() { + if (uncomp_cached_data_.GetCacheIndex() != -1) { + assert(ctx_cache_ != nullptr); + ctx_cache_->ReturnCachedZSTDUncompressData( + uncomp_cached_data_.GetCacheIndex()); + } + } + UncompressionContext(const UncompressionContext&) = delete; + UncompressionContext& operator=(const UncompressionContext&) = delete; + + ZSTDUncompressCachedData::ZSTDNativeContext GetZSTDContext() const { + return uncomp_cached_data_.Get(); + } +}; + +class UncompressionInfo { + const UncompressionContext& context_; + const UncompressionDict& dict_; + const CompressionType type_; + + public: + UncompressionInfo(const UncompressionContext& _context, + const UncompressionDict& _dict, CompressionType _type) + : context_(_context), dict_(_dict), type_(_type) {} + + const UncompressionContext& context() const { return context_; } + const UncompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } +}; + +inline bool Snappy_Supported() { +#ifdef SNAPPY + return true; +#else + return false; +#endif +} + +inline bool Zlib_Supported() { +#ifdef ZLIB + return true; +#else + return false; +#endif +} + +inline bool BZip2_Supported() { +#ifdef BZIP2 + return true; +#else + return false; +#endif +} + +inline bool LZ4_Supported() { +#ifdef LZ4 + return true; +#else + return false; +#endif +} + +inline bool XPRESS_Supported() { +#ifdef XPRESS + return true; +#else + return false; +#endif +} + +inline bool ZSTD_Supported() { +#ifdef ZSTD + // ZSTD format is finalized since version 0.8.0. 
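+  // (Clarifying note, not in the original:) ZSTD_versionNumber() returns
+  // MAJOR*10000 + MINOR*100 + RELEASE, so 800 corresponds to v0.8.0.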
+ return (ZSTD_versionNumber() >= 800); +#else + return false; +#endif +} + +inline bool ZSTDNotFinal_Supported() { +#ifdef ZSTD + return true; +#else + return false; +#endif +} + +inline bool CompressionTypeSupported(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return true; + case kSnappyCompression: + return Snappy_Supported(); + case kZlibCompression: + return Zlib_Supported(); + case kBZip2Compression: + return BZip2_Supported(); + case kLZ4Compression: + return LZ4_Supported(); + case kLZ4HCCompression: + return LZ4_Supported(); + case kXpressCompression: + return XPRESS_Supported(); + case kZSTDNotFinalCompression: + return ZSTDNotFinal_Supported(); + case kZSTD: + return ZSTD_Supported(); + default: + assert(false); + return false; + } +} + +inline std::string CompressionTypeToString(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return "NoCompression"; + case kSnappyCompression: + return "Snappy"; + case kZlibCompression: + return "Zlib"; + case kBZip2Compression: + return "BZip2"; + case kLZ4Compression: + return "LZ4"; + case kLZ4HCCompression: + return "LZ4HC"; + case kXpressCompression: + return "Xpress"; + case kZSTD: + return "ZSTD"; + case kZSTDNotFinalCompression: + return "ZSTDNotFinal"; + default: + assert(false); + return ""; + } +} + +inline std::string CompressionOptionsToString( + CompressionOptions& compression_options) { + std::string result; + result.reserve(512); + result.append("window_bits=") + .append(ToString(compression_options.window_bits)) + .append("; "); + result.append("level=") + .append(ToString(compression_options.level)) + .append("; "); + result.append("strategy=") + .append(ToString(compression_options.strategy)) + .append("; "); + result.append("max_dict_bytes=") + .append(ToString(compression_options.max_dict_bytes)) + .append("; "); + result.append("zstd_max_train_bytes=") + .append(ToString(compression_options.zstd_max_train_bytes)) + .append("; "); + result.append("enabled=") + .append(ToString(compression_options.enabled)) + .append("; "); + return result; +} + +// compress_format_version can have two values: +// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed +// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent +// way. +// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the +// start of compressed block. Snappy format is the same as version 1. 
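To make the version-2 framing concrete, here is a minimal sketch (not part of the patch; ReadV2SizeHeader is a hypothetical name) of peeling the varint32 size header off a block with the coding helpers exercised in coding_test.cc above. It mirrors what compression::GetDecompressedSizeInfo(), defined just below, does:

#include <cstdint>
#include <string>
#include "util/coding.h"

// On a format-version-2 block the payload is preceded by the decompressed
// size as a varint32; version-1 blocks (and Snappy) have no such header.
bool ReadV2SizeHeader(const char** data, size_t* len,
                      uint32_t* decompressed_size) {
  const char* p =
      rocksdb::GetVarint32Ptr(*data, *data + *len, decompressed_size);
  if (p == nullptr) {
    return false;  // truncated or malformed varint
  }
  *len -= static_cast<size_t>(p - *data);
  *data = p;  // now points at the compressed bytes
  return true;
}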
+ +inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#else + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { +#ifdef SNAPPY + return snappy::GetUncompressedLength(input, length, result); +#else + (void)input; + (void)length; + (void)result; + return false; +#endif +} + +inline bool Snappy_Uncompress(const char* input, size_t length, char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +namespace compression { +// returns size +inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) { + PutVarint32(output, length); + return output->size(); +} + +inline bool GetDecompressedSizeInfo(const char** input_data, + size_t* input_length, + uint32_t* output_len) { + auto new_input_data = + GetVarint32Ptr(*input_data, *input_data + *input_length, output_len); + if (new_input_data == nullptr) { + return false; + } + *input_length -= (new_input_data - *input_data); + *input_data = new_input_data; + return true; +} +} // namespace compression + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline bool Zlib_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + if (length > std::numeric_limits<uint32_t>::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast<uint32_t>(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + level = Z_DEFAULT_COMPRESSION; + } else { + level = info.options().level; + } + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits, + memLevel, info.options().strategy); + if (st != Z_OK) { + return false; + } + + Slice compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + // Initialize the compression library's dictionary + st = deflateSetDictionary( + &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()), + static_cast<unsigned int>(compression_dict.size())); + if (st != Z_OK) { + deflateEnd(&_stream); + return false; + } + } + + // Compress the input, and put compressed data in output. 
+  _stream.next_in = (Bytef*)input;
+  _stream.avail_in = static_cast<unsigned int>(length);
+
+  // Initialize the output size.
+  _stream.avail_out = static_cast<unsigned int>(length);
+  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);
+
+  bool compressed = false;
+  st = deflate(&_stream, Z_FINISH);
+  if (st == Z_STREAM_END) {
+    compressed = true;
+    output->resize(output->size() - _stream.avail_out);
+  }
+  // The only return value we really care about is Z_STREAM_END.
+  // Z_OK means insufficient output space, i.e. the compressed data is bigger
+  // than the uncompressed input. Just fail the compression in that case.
+
+  deflateEnd(&_stream);
+  return compressed;
+#else
+  (void)info;
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline CacheAllocationPtr Zlib_Uncompress(
+    const UncompressionInfo& info, const char* input_data, size_t input_length,
+    int* decompress_size, uint32_t compress_format_version,
+    MemoryAllocator* allocator = nullptr, int windowBits = -14) {
+#ifdef ZLIB
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data will be 5x the compressed size; round up
+    // to the next page boundary.
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st =
+      inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits);
+  if (st != Z_OK) {
+    return nullptr;
+  }
+
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    // Initialize the compression library's dictionary
+    st = inflateSetDictionary(
+        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
+        static_cast<unsigned int>(compression_dict.size()));
+    if (st != Z_OK) {
+      return nullptr;
+    }
+  }
+
+  _stream.next_in = (Bytef*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (Bytef*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = inflate(&_stream, Z_SYNC_FLUSH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        size_t old_sz = output_len;
+        uint32_t output_len_delta = output_len / 5;
+        output_len += output_len_delta < 10 ? 10 : output_len_delta;
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Set more output.
+ _stream.next_out = (Bytef*)(output.get() + old_sz); + _stream.avail_out = static_cast<unsigned int>(output_len - old_sz); + break; + } + case Z_BUF_ERROR: + default: + inflateEnd(&_stream); + return nullptr; + } + } + + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); + *decompress_size = static_cast<int>(output_len - _stream.avail_out); + inflateEnd(&_stream); + return output; +#else + (void)info; + (void)input_data; + (void)input_length; + (void)decompress_size; + (void)compress_format_version; + (void)allocator; + (void)windowBits; + return nullptr; +#endif +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool BZip2_Compress(const CompressionInfo& /*info*/, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + if (length > std::numeric_limits<uint32_t>::max()) { + // Can't compress more than 4GB + return false; + } + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast<uint32_t>(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Compress the input, and put compressed data in output. + _stream.next_in = (char*)input; + _stream.avail_in = static_cast<unsigned int>(length); + + // Initialize the output size. + _stream.avail_out = static_cast<unsigned int>(length); + _stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]); + + bool compressed = false; + st = BZ2_bzCompress(&_stream, BZ_FINISH); + if (st == BZ_STREAM_END) { + compressed = true; + output->resize(output->size() - _stream.avail_out); + } + // The only return value we really care about is BZ_STREAM_END. + // BZ_FINISH_OK means insufficient output space. This means the compression + // is bigger than decompressed size. Just fail the compression in that case. 
+
+  BZ2_bzCompressEnd(&_stream);
+  return compressed;
+#else
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline CacheAllocationPtr BZip2_Uncompress(
+    const char* input_data, size_t input_length, int* decompress_size,
+    uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) {
+#ifdef BZIP2
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data will be 5x the compressed size; round up
+    // to the next page boundary.
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (char*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        done = true;
+        break;
+      case BZ_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        uint32_t old_sz = output_len;
+        output_len = static_cast<uint32_t>(output_len * 1.2);
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Set more output.
+        _stream.next_out = (char*)(output.get() + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      default:
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  *decompress_size = static_cast<int>(output_len - _stream.avail_out);
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#else
+  (void)input_data;
+  (void)input_length;
+  (void)decompress_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline bool LZ4_Compress(const CompressionInfo& info,
+                         uint32_t compress_format_version, const char* input,
+                         size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+  int compress_bound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compress_bound));
+
+  int outlen;
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_stream_t* stream = LZ4_createStream();
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    LZ4_loadDict(stream, compression_dict.data(),
+                 static_cast<int>(compression_dict.size()));
+  }
+#if LZ4_VERSION_NUMBER >= 10700  // r129+
+  outlen =
+      LZ4_compress_fast_continue(stream, input, &(*output)[output_header_len],
+                                 static_cast<int>(length), compress_bound, 1);
+#else  // up to r128
+  outlen = LZ4_compress_limitedOutput_continue(
+      stream, input, &(*output)[output_header_len], static_cast<int>(length),
+      compress_bound);
+#endif
+  LZ4_freeStream(stream);
+#else  // up to r123
+  outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
+                                      static_cast<int>(length), compress_bound);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(static_cast<size_t>(output_header_len + outlen));
+  return true;
+#else  // LZ4
+  (void)info;
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info,
+                                         const char* input_data,
+                                         size_t input_length,
+                                         int* decompress_size,
+                                         uint32_t compress_format_version,
+                                         MemoryAllocator* allocator = nullptr) {
+#ifdef LZ4
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    if (input_length < 8) {
+      return nullptr;
+    }
+    memcpy(&output_len, input_data, sizeof(output_len));
+    input_length -= 8;
+    input_data += 8;
+  }
+
+  auto output = AllocateBlock(output_len, allocator);
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    LZ4_setStreamDecode(stream, compression_dict.data(),
+                        static_cast<int>(compression_dict.size()));
+  }
+  *decompress_size = LZ4_decompress_safe_continue(
+      stream, input_data, output.get(), static_cast<int>(input_length),
+      static_cast<int>(output_len));
+  LZ4_freeStreamDecode(stream);
+#else  // up to r123
+  *decompress_size = LZ4_decompress_safe(input_data, output.get(),
+                                         static_cast<int>(input_length),
+                                         static_cast<int>(output_len));
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (*decompress_size < 0) {
+    return nullptr;
+  }
+  assert(*decompress_size == static_cast<int>(output_len));
+  return output;
+#else  // LZ4
+  (void)info;
+  (void)input_data;
+  (void)input_length;
+  (void)decompress_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes the database non-portable
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+// dictionary.
+inline bool LZ4HC_Compress(const CompressionInfo& info,
+                           uint32_t compress_format_version, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+  int compress_bound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compress_bound));
+
+  int outlen;
+  int level;
+  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
+    level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
+  } else {
+    level = info.options().level;
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamHC_t* stream = LZ4_createStreamHC();
+  LZ4_resetStreamHC(stream, level);
+  Slice compression_dict = info.dict().GetRawDict();
+  const char* compression_dict_data =
+      compression_dict.size() > 0 ?
compression_dict.data() : nullptr; + size_t compression_dict_size = compression_dict.size(); + LZ4_loadDictHC(stream, compression_dict_data, + static_cast<int>(compression_dict_size)); + +#if LZ4_VERSION_NUMBER >= 10700 // r129+ + outlen = + LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len], + static_cast<int>(length), compress_bound); +#else // r124-r128 + outlen = LZ4_compressHC_limitedOutput_continue( + stream, input, &(*output)[output_header_len], static_cast<int>(length), + compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10700 + LZ4_freeStreamHC(stream); + +#elif LZ4_VERSION_MAJOR // r113-r123 + outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len], + static_cast<int>(length), + compress_bound, level); +#else // up to r112 + outlen = + LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len], + static_cast<int>(length), compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (outlen == 0) { + return false; + } + output->resize(static_cast<size_t>(output_header_len + outlen)); + return true; +#else // LZ4 + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +#ifdef XPRESS +inline bool XPRESS_Compress(const char* input, size_t length, + std::string* output) { + return port::xpress::Compress(input, length, output); +} +#else +inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/, + std::string* /*output*/) { + return false; +} +#endif + +#ifdef XPRESS +inline char* XPRESS_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { + return port::xpress::Decompress(input_data, input_length, decompress_size); +} +#else +inline char* XPRESS_Uncompress(const char* /*input_data*/, + size_t /*input_length*/, + int* /*decompress_size*/) { + return nullptr; +} +#endif + +inline bool ZSTD_Compress(const CompressionInfo& info, const char* input, + size_t length, ::std::string* output) { +#ifdef ZSTD + if (length > std::numeric_limits<uint32_t>::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast<uint32_t>(length)); + + size_t compressBound = ZSTD_compressBound(length); + output->resize(static_cast<size_t>(output_header_len + compressBound)); + size_t outlen = 0; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } else { + level = info.options().level; + } +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_CCtx* context = info.context().ZSTDPreallocCtx(); + assert(context != nullptr); +#if ZSTD_VERSION_NUMBER >= 700 // v0.7.0+ + if (info.dict().GetDigestedZstdCDict() != nullptr) { + outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetDigestedZstdCDict()); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + if (outlen == 0) { + outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetRawDict().data(), + info.dict().GetRawDict().size(), level); + } +#else // up to v0.4.x + outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input, + length, level); +#endif // ZSTD_VERSION_NUMBER >= 500 + if (outlen == 0) { + return false; + } + output->resize(output_header_len + outlen); + return true; +#else // ZSTD + (void)info; + 
(void)input; + (void)length; + (void)output; + return false; +#endif +} + +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline CacheAllocationPtr ZSTD_Uncompress( + const UncompressionInfo& info, const char* input_data, size_t input_length, + int* decompress_size, MemoryAllocator* allocator = nullptr) { +#ifdef ZSTD + uint32_t output_len = 0; + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + + auto output = AllocateBlock(output_len, allocator); + size_t actual_output_length = 0; +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_DCtx* context = info.context().GetZSTDContext(); + assert(context != nullptr); +#ifdef ROCKSDB_ZSTD_DDICT + if (info.dict().GetDigestedZstdDDict() != nullptr) { + actual_output_length = ZSTD_decompress_usingDDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetDigestedZstdDDict()); + } +#endif // ROCKSDB_ZSTD_DDICT + if (actual_output_length == 0) { + actual_output_length = ZSTD_decompress_usingDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetRawDict().data(), info.dict().GetRawDict().size()); + } +#else // up to v0.4.x + (void)info; + actual_output_length = + ZSTD_decompress(output.get(), output_len, input_data, input_length); +#endif // ZSTD_VERSION_NUMBER >= 500 + assert(actual_output_length == output_len); + *decompress_size = static_cast<int>(actual_output_length); + return output; +#else // ZSTD + (void)info; + (void)input_data; + (void)input_length; + (void)decompress_size; + (void)allocator; + return nullptr; +#endif +} + +inline bool ZSTD_TrainDictionarySupported() { +#ifdef ZSTD + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. + return (ZSTD_versionNumber() >= 10103); +#else + return false; +#endif +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + const std::vector<size_t>& sample_lens, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ + assert(samples.empty() == sample_lens.empty()); + if (samples.empty()) { + return ""; + } + std::string dict_data(max_dict_bytes, '\0'); + size_t dict_len = ZDICT_trainFromBuffer( + &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0], + static_cast<unsigned>(sample_lens.size())); + if (ZDICT_isError(dict_len)) { + return ""; + } + assert(dict_len <= max_dict_bytes); + dict_data.resize(dict_len); + return dict_data; +#else // up to v1.1.2 + assert(false); + (void)samples; + (void)sample_lens; + (void)max_dict_bytes; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10103 +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + size_t sample_len_shift, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1, but ZSTD was marked stable + // only since v0.8.0. For now we enable the feature in stable versions only. 
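+  // (Illustrative, not in the original:) sample_len_shift == 10 means 1 KiB
+  // samples, so a 1 MiB "samples" buffer yields 1024 equal-length samples.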
+#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
+  // skips potential partial sample at the end of "samples"
+  size_t num_samples = samples.size() >> sample_len_shift;
+  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
+  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
+#else   // up to v1.1.2
+  assert(false);
+  (void)samples;
+  (void)sample_len_shift;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD_VERSION_NUMBER >= 10103
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/compression_context_cache.cc b/src/rocksdb/util/compression_context_cache.cc
new file mode 100644
index 00000000..6fb5c4fc
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#include "util/compression_context_cache.h"
+
+#include "util/compression.h"
+#include "util/core_local.h"
+
+#include <atomic>
+
+namespace rocksdb {
+namespace compression_cache {
+
+void* const SentinelValue = nullptr;
+// Cache ZSTD uncompression contexts for reads.
+// If needed, we can add ZSTD compression context caching, which is currently
+// not done since BlockBasedTableBuilder simply creates one compression
+// context per new SST file.
+struct ZSTDCachedData {
+  // We choose to cache the structure below instead of a pointer because we
+  // want to a) avoid leaking native types and b) make cache use transparent
+  // to the user
+  ZSTDUncompressCachedData uncomp_cached_data_;
+  std::atomic<void*> zstd_uncomp_sentinel_;
+
+  char
+      padding[(CACHE_LINE_SIZE -
+               (sizeof(ZSTDUncompressCachedData) + sizeof(std::atomic<void*>)) %
+                   CACHE_LINE_SIZE)];  // unused padding field
+
+  ZSTDCachedData() : zstd_uncomp_sentinel_(&uncomp_cached_data_) {}
+  ZSTDCachedData(const ZSTDCachedData&) = delete;
+  ZSTDCachedData& operator=(const ZSTDCachedData&) = delete;
+
+  ZSTDUncompressCachedData GetUncompressData(int64_t idx) {
+    ZSTDUncompressCachedData result;
+    void* expected = &uncomp_cached_data_;
+    if (zstd_uncomp_sentinel_.compare_exchange_strong(expected,
+                                                      SentinelValue)) {
+      uncomp_cached_data_.CreateIfNeeded();
+      result.InitFromCache(uncomp_cached_data_, idx);
+    } else {
+      // Creates one-time-use data
+      result.CreateIfNeeded();
+    }
+    return result;
+  }
+  // Return the entry back into circulation.
+  // This is executed only when we successfully obtained the entry
+  // in the first place.
+  void ReturnUncompressData() {
+    if (zstd_uncomp_sentinel_.exchange(&uncomp_cached_data_) != SentinelValue) {
+      // Means we are returning while not having it acquired.
+      assert(false);
+    }
+  }
+};
+static_assert(sizeof(ZSTDCachedData) % CACHE_LINE_SIZE == 0,
+              "Expected CACHE_LINE_SIZE alignment");
+}  // namespace compression_cache
+
+using namespace compression_cache;
+
+class CompressionContextCache::Rep {
+ public:
+  Rep() {}
+  ZSTDUncompressCachedData GetZSTDUncompressData() {
+    auto p = per_core_uncompr_.AccessElementAndIndex();
+    int64_t idx = static_cast<int64_t>(p.second);
+    return p.first->GetUncompressData(idx);
+  }
+  void ReturnZSTDUncompressData(int64_t idx) {
+    assert(idx >= 0);
+    auto* cn = per_core_uncompr_.AccessAtCore(static_cast<size_t>(idx));
+    cn->ReturnUncompressData();
+  }
+
+ private:
+  CoreLocalArray<ZSTDCachedData> per_core_uncompr_;
+};
+
+CompressionContextCache::CompressionContextCache() : rep_(new Rep()) {}
+
+CompressionContextCache* CompressionContextCache::Instance() {
+  static CompressionContextCache instance;
+  return &instance;
+}
+
+void CompressionContextCache::InitSingleton() { Instance(); }
+
+ZSTDUncompressCachedData
+CompressionContextCache::GetCachedZSTDUncompressData() {
+  return rep_->GetZSTDUncompressData();
+}
+
+void CompressionContextCache::ReturnCachedZSTDUncompressData(int64_t idx) {
+  rep_->ReturnZSTDUncompressData(idx);
+}
+
+CompressionContextCache::~CompressionContextCache() { delete rep_; }
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/compression_context_cache.h b/src/rocksdb/util/compression_context_cache.h
new file mode 100644
index 00000000..4ea6ae35
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+// The compression context cache allows caching compression/uncompression
+// contexts. This helps with random-read latencies and reduces CPU utilization.
+// Caching is implemented using the CoreLocal facility: compression/
+// uncompression instances are cached on a per-core basis using CoreLocalArray.
+// A borrowed instance is atomically replaced with a sentinel value for the
+// duration of its use. If another thread is already making use of the cached
+// instance, we create a one-off instance on the heap, which is destroyed
+// after use.
+
+#pragma once
+
+#include <stdint.h>
+
+namespace rocksdb {
+class ZSTDUncompressCachedData;
+
+class CompressionContextCache {
+ public:
+  // Singleton
+  static CompressionContextCache* Instance();
+  static void InitSingleton();
+  CompressionContextCache(const CompressionContextCache&) = delete;
+  CompressionContextCache& operator=(const CompressionContextCache&) = delete;
+
+  ZSTDUncompressCachedData GetCachedZSTDUncompressData();
+  void ReturnCachedZSTDUncompressData(int64_t idx);
+
+ private:
+  // Singleton
+  CompressionContextCache();
+  ~CompressionContextCache();
+
+  class Rep;
+  Rep* rep_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/concurrent_arena.cc b/src/rocksdb/util/concurrent_arena.cc
new file mode 100644
index 00000000..cef77d7e
--- /dev/null
+++ b/src/rocksdb/util/concurrent_arena.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
diff --git a/src/rocksdb/util/concurrent_arena.cc b/src/rocksdb/util/concurrent_arena.cc
new file mode 100644
index 00000000..cef77d7e
--- /dev/null
+++ b/src/rocksdb/util/concurrent_arena.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/concurrent_arena.h"
+#include <thread>
+#include "port/port.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+__thread size_t ConcurrentArena::tls_cpuid = 0;
+#endif
+
+namespace {
+// If the shard block size is too large, in the worst case, every core
+// allocates a block without populating it. If the shard block size is
+// 1MB, 64 cores will quickly allocate 64MB, and may quickly trigger a
+// flush. Cap the size instead.
+const size_t kMaxShardBlockSize = size_t{128 * 1024};
+}  // namespace
+
+ConcurrentArena::ConcurrentArena(size_t block_size, AllocTracker* tracker,
+                                 size_t huge_page_size)
+    : shard_block_size_(std::min(kMaxShardBlockSize, block_size / 8)),
+      shards_(),
+      arena_(block_size, tracker, huge_page_size) {
+  Fixup();
+}
+
+ConcurrentArena::Shard* ConcurrentArena::Repick() {
+  auto shard_and_index = shards_.AccessElementAndIndex();
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+  // even if we are on cpu 0, use a non-zero tls_cpuid so we can tell that
+  // we have repicked
+  tls_cpuid = shard_and_index.second | shards_.Size();
+#endif
+  return shard_and_index.first;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/concurrent_arena.h b/src/rocksdb/util/concurrent_arena.h
new file mode 100644
index 00000000..a6191100
--- /dev/null
+++ b/src/rocksdb/util/concurrent_arena.h
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+#include <utility>
+#include "port/likely.h"
+#include "util/allocator.h"
+#include "util/arena.h"
+#include "util/core_local.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// Only generate the field-unused warning for the padding arrays, or the
+// build will fail under GCC 4.8.1.
+#ifdef __clang__
+#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__))
+#else
+#define ROCKSDB_FIELD_UNUSED
+#endif  // __clang__
+
+namespace rocksdb {
+
+class Logger;
+
+// ConcurrentArena wraps an Arena. It makes it thread-safe using a fast
+// inlined spinlock, and adds small per-core allocation caches to avoid
+// contention for small allocations. To avoid any memory waste from the
+// per-core shards, they are kept small, they are lazily instantiated
+// only if ConcurrentArena actually notices concurrent use, and they
+// adjust their size so that there is no fragmentation waste when the
+// shard blocks are allocated from the underlying main arena.
+class ConcurrentArena : public Allocator {
+ public:
+  // block_size and huge_page_size are the same as for Arena (and are
+  // in fact just passed to the constructor of arena_).
The core-local + // shards compute their shard_block_size as a fraction of block_size + // that varies according to the hardware concurrency level. + explicit ConcurrentArena(size_t block_size = Arena::kMinBlockSize, + AllocTracker* tracker = nullptr, + size_t huge_page_size = 0); + + char* Allocate(size_t bytes) override { + return AllocateImpl(bytes, false /*force_arena*/, + [=]() { return arena_.Allocate(bytes); }); + } + + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override { + size_t rounded_up = ((bytes - 1) | (sizeof(void*) - 1)) + 1; + assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) && + (rounded_up % sizeof(void*)) == 0); + + return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/, [=]() { + return arena_.AllocateAligned(rounded_up, huge_page_size, logger); + }); + } + + size_t ApproximateMemoryUsage() const { + std::unique_lock<SpinMutex> lock(arena_mutex_, std::defer_lock); + lock.lock(); + return arena_.ApproximateMemoryUsage() - ShardAllocatedAndUnused(); + } + + size_t MemoryAllocatedBytes() const { + return memory_allocated_bytes_.load(std::memory_order_relaxed); + } + + size_t AllocatedAndUnused() const { + return arena_allocated_and_unused_.load(std::memory_order_relaxed) + + ShardAllocatedAndUnused(); + } + + size_t IrregularBlockNum() const { + return irregular_block_num_.load(std::memory_order_relaxed); + } + + size_t BlockSize() const override { return arena_.BlockSize(); } + + private: + struct Shard { + char padding[40] ROCKSDB_FIELD_UNUSED; + mutable SpinMutex mutex; + char* free_begin_; + std::atomic<size_t> allocated_and_unused_; + + Shard() : free_begin_(nullptr), allocated_and_unused_(0) {} + }; + +#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL + static __thread size_t tls_cpuid; +#else + enum ZeroFirstEnum : size_t { tls_cpuid = 0 }; +#endif + + char padding0[56] ROCKSDB_FIELD_UNUSED; + + size_t shard_block_size_; + + CoreLocalArray<Shard> shards_; + + Arena arena_; + mutable SpinMutex arena_mutex_; + std::atomic<size_t> arena_allocated_and_unused_; + std::atomic<size_t> memory_allocated_bytes_; + std::atomic<size_t> irregular_block_num_; + + char padding1[56] ROCKSDB_FIELD_UNUSED; + + Shard* Repick(); + + size_t ShardAllocatedAndUnused() const { + size_t total = 0; + for (size_t i = 0; i < shards_.Size(); ++i) { + total += shards_.AccessAtCore(i)->allocated_and_unused_.load( + std::memory_order_relaxed); + } + return total; + } + + template <typename Func> + char* AllocateImpl(size_t bytes, bool force_arena, const Func& func) { + size_t cpu; + + // Go directly to the arena if the allocation is too large, or if + // we've never needed to Repick() and the arena mutex is available + // with no waiting. This keeps the fragmentation penalty of + // concurrency zero unless it might actually confer an advantage. 
+    std::unique_lock<SpinMutex> arena_lock(arena_mutex_, std::defer_lock);
+    if (bytes > shard_block_size_ / 4 || force_arena ||
+        ((cpu = tls_cpuid) == 0 &&
+         !shards_.AccessAtCore(0)->allocated_and_unused_.load(
+             std::memory_order_relaxed) &&
+         arena_lock.try_lock())) {
+      if (!arena_lock.owns_lock()) {
+        arena_lock.lock();
+      }
+      auto rv = func();
+      Fixup();
+      return rv;
+    }
+
+    // pick a shard from which to allocate
+    Shard* s = shards_.AccessAtCore(cpu & (shards_.Size() - 1));
+    if (!s->mutex.try_lock()) {
+      s = Repick();
+      s->mutex.lock();
+    }
+    std::unique_lock<SpinMutex> lock(s->mutex, std::adopt_lock);
+
+    size_t avail = s->allocated_and_unused_.load(std::memory_order_relaxed);
+    if (avail < bytes) {
+      // refill this shard from the main arena
+      std::lock_guard<SpinMutex> reload_lock(arena_mutex_);
+
+      // If the arena's current block is within a factor of 2 of the right
+      // size, we adjust our request to avoid arena waste.
+      auto exact = arena_allocated_and_unused_.load(std::memory_order_relaxed);
+      assert(exact == arena_.AllocatedAndUnused());
+
+      if (exact >= bytes && arena_.IsInInlineBlock()) {
+        // If we haven't exhausted the arena's inline block yet, allocate from
+        // the arena directly. This ensures that we'll do the first few small
+        // allocations without allocating any blocks.
+        // In particular this prevents empty memtables from using a
+        // disproportionately large amount of memory: a memtable allocates on
+        // the order of 1 KB of memory when created; we wouldn't want to
+        // allocate a full arena block (typically a few megabytes) for that,
+        // especially if there are thousands of empty memtables.
+        auto rv = func();
+        Fixup();
+        return rv;
+      }
+
+      avail = exact >= shard_block_size_ / 2 && exact < shard_block_size_ * 2
+                  ? exact
+                  : shard_block_size_;
+      s->free_begin_ = arena_.AllocateAligned(avail);
+      Fixup();
+    }
+    s->allocated_and_unused_.store(avail - bytes, std::memory_order_relaxed);
+
+    char* rv;
+    if ((bytes % sizeof(void*)) == 0) {
+      // aligned allocation from the beginning
+      rv = s->free_begin_;
+      s->free_begin_ += bytes;
+    } else {
+      // unaligned from the end
+      rv = s->free_begin_ + avail - bytes;
+    }
+    return rv;
+  }
+
+  void Fixup() {
+    arena_allocated_and_unused_.store(arena_.AllocatedAndUnused(),
+                                      std::memory_order_relaxed);
+    memory_allocated_bytes_.store(arena_.MemoryAllocatedBytes(),
+                                  std::memory_order_relaxed);
+    irregular_block_num_.store(arena_.IrregularBlockNum(),
+                               std::memory_order_relaxed);
+  }
+
+  ConcurrentArena(const ConcurrentArena&) = delete;
+  ConcurrentArena& operator=(const ConcurrentArena&) = delete;
+};
+
+}  // namespace rocksdb
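Before the task-limiter files, a standalone sketch of the allocation shape ConcurrentArena implements above: try-lock the shard for the caller's slot, repick a neighbor on contention instead of waiting, and refill from a shared pool only when the shard runs dry. All names are illustrative and the block sizing is simplified.

#include <algorithm>
#include <memory>
#include <mutex>
#include <vector>

struct MiniShard {
  std::mutex mu;
  char* cursor = nullptr;
  size_t free_bytes = 0;
};

struct MiniPool {
  std::mutex mu;
  std::vector<std::unique_ptr<char[]>> blocks;
  // Grants a fresh block of at least `bytes`; caller must hold `mu`.
  char* Refill(MiniShard& s, size_t bytes) {
    size_t granted = std::max<size_t>(bytes, 4096);
    blocks.emplace_back(new char[granted]);
    s.free_bytes = granted;
    return blocks.back().get();
  }
};

char* ShardedAllocate(std::vector<MiniShard>& shards, MiniPool& pool,
                      size_t slot, size_t bytes) {
  MiniShard* s = &shards[slot % shards.size()];
  if (!s->mu.try_lock()) {
    s = &shards[(slot + 1) % shards.size()];  // "repick" a neighbor
    s->mu.lock();
  }
  std::lock_guard<std::mutex> g(s->mu, std::adopt_lock);
  if (s->free_bytes < bytes) {
    std::lock_guard<std::mutex> pg(pool.mu);  // slow path: shared pool lock
    s->cursor = pool.Refill(*s, bytes);
  }
  char* rv = s->cursor;
  s->cursor += bytes;
  s->free_bytes -= bytes;
  return rv;
}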
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.cc b/src/rocksdb/util/concurrent_task_limiter_impl.cc
new file mode 100644
index 00000000..e1ce4bef
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/concurrent_task_limiter_impl.h"
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace rocksdb {
+
+ConcurrentTaskLimiterImpl::ConcurrentTaskLimiterImpl(
+    const std::string& name, int32_t max_outstanding_task)
+    : name_(name),
+      max_outstanding_tasks_{max_outstanding_task},
+      outstanding_tasks_{0} {}
+
+ConcurrentTaskLimiterImpl::~ConcurrentTaskLimiterImpl() {
+  assert(outstanding_tasks_ == 0);
+}
+
+const std::string& ConcurrentTaskLimiterImpl::GetName() const {
+  return name_;
+}
+
+void ConcurrentTaskLimiterImpl::SetMaxOutstandingTask(int32_t limit) {
+  max_outstanding_tasks_.store(limit, std::memory_order_relaxed);
+}
+
+void ConcurrentTaskLimiterImpl::ResetMaxOutstandingTask() {
+  max_outstanding_tasks_.store(-1, std::memory_order_relaxed);
+}
+
+int32_t ConcurrentTaskLimiterImpl::GetOutstandingTask() const {
+  return outstanding_tasks_.load(std::memory_order_relaxed);
+}
+
+std::unique_ptr<TaskLimiterToken> ConcurrentTaskLimiterImpl::GetToken(
+    bool force) {
+  int32_t limit = max_outstanding_tasks_.load(std::memory_order_relaxed);
+  int32_t tasks = outstanding_tasks_.load(std::memory_order_relaxed);
+  // force == true bypasses the throttle.
+  // limit < 0 means unlimited tasks.
+  while (force || limit < 0 || tasks < limit) {
+    if (outstanding_tasks_.compare_exchange_weak(tasks, tasks + 1)) {
+      return std::unique_ptr<TaskLimiterToken>(new TaskLimiterToken(this));
+    }
+  }
+  return nullptr;
+}
+
+ConcurrentTaskLimiter* NewConcurrentTaskLimiter(
+    const std::string& name, int32_t limit) {
+  return new ConcurrentTaskLimiterImpl(name, limit);
+}
+
+TaskLimiterToken::~TaskLimiterToken() {
+  --limiter_->outstanding_tasks_;
+  assert(limiter_->outstanding_tasks_ >= 0);
+}
+
+}  // namespace rocksdb
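A sketch of the expected call-site shape for the limiter (the real users are RocksDB's compaction paths; the function name here is illustrative). GetToken() is declared on the impl class in the header below, so a caller holding the public ConcurrentTaskLimiter* downcasts first:

#include <memory>
#include "util/concurrent_task_limiter_impl.h"

void MaybeScheduleTask(rocksdb::ConcurrentTaskLimiter* limiter) {
  auto* impl = static_cast<rocksdb::ConcurrentTaskLimiterImpl*>(limiter);
  std::unique_ptr<rocksdb::TaskLimiterToken> token =
      impl->GetToken(/*force=*/false);
  if (token == nullptr) {
    return;  // throttled: outstanding tasks are at the configured limit
  }
  // ... run the task; the token's destructor releases the slot ...
}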
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.h b/src/rocksdb/util/concurrent_task_limiter_impl.h
new file mode 100644
index 00000000..515f1481
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/env.h"
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace rocksdb {
+
+class TaskLimiterToken;
+
+class ConcurrentTaskLimiterImpl : public ConcurrentTaskLimiter {
+ public:
+  explicit ConcurrentTaskLimiterImpl(const std::string& name,
+                                     int32_t max_outstanding_task);
+
+  virtual ~ConcurrentTaskLimiterImpl();
+
+  virtual const std::string& GetName() const override;
+
+  virtual void SetMaxOutstandingTask(int32_t limit) override;
+
+  virtual void ResetMaxOutstandingTask() override;
+
+  virtual int32_t GetOutstandingTask() const override;
+
+  // Request a token for adding a new task.
+  // If force == true, it requests a token bypassing the throttle.
+  // Returns nullptr if it got throttled.
+  virtual std::unique_ptr<TaskLimiterToken> GetToken(bool force);
+
+ private:
+  friend class TaskLimiterToken;
+
+  std::string name_;
+  std::atomic<int32_t> max_outstanding_tasks_;
+  std::atomic<int32_t> outstanding_tasks_;
+
+  // No copying allowed
+  ConcurrentTaskLimiterImpl(const ConcurrentTaskLimiterImpl&) = delete;
+  ConcurrentTaskLimiterImpl& operator=(
+      const ConcurrentTaskLimiterImpl&) = delete;
+};
+
+class TaskLimiterToken {
+ public:
+  explicit TaskLimiterToken(ConcurrentTaskLimiterImpl* limiter)
+      : limiter_(limiter) {}
+  ~TaskLimiterToken();
+
+ private:
+  ConcurrentTaskLimiterImpl* limiter_;
+
+  // no copying allowed
+  TaskLimiterToken(const TaskLimiterToken&) = delete;
+  void operator=(const TaskLimiterToken&) = delete;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h
new file mode 100644
index 00000000..4cc4fd90
--- /dev/null
+++ b/src/rocksdb/util/core_local.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+// An array of core-local values. Ideally the value type, T, is cache-aligned
+// to prevent false sharing.
+template <typename T>
+class CoreLocalArray {
+ public:
+  CoreLocalArray();
+
+  size_t Size() const;
+  // returns pointer to the element corresponding to the core that the thread
+  // currently runs on.
+  T* Access() const;
+  // same as above, but also returns the core index, which the client can cache
+  // to reduce how often the core ID needs to be retrieved. Only do this if
+  // some inaccuracy is tolerable, as the thread may migrate to a different
+  // core.
+  std::pair<T*, size_t> AccessElementAndIndex() const;
+  // returns pointer to the element for the specified core index. This can be
+  // used, e.g., for aggregation, or if the client caches the core index.
+ T* AccessAtCore(size_t core_idx) const; + + private: + std::unique_ptr<T[]> data_; + int size_shift_; +}; + +template <typename T> +CoreLocalArray<T>::CoreLocalArray() { + int num_cpus = static_cast<int>(std::thread::hardware_concurrency()); + // find a power of two >= num_cpus and >= 8 + size_shift_ = 3; + while (1 << size_shift_ < num_cpus) { + ++size_shift_; + } + data_.reset(new T[static_cast<size_t>(1) << size_shift_]); +} + +template <typename T> +size_t CoreLocalArray<T>::Size() const { + return static_cast<size_t>(1) << size_shift_; +} + +template <typename T> +T* CoreLocalArray<T>::Access() const { + return AccessElementAndIndex().first; +} + +template <typename T> +std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const { + int cpuid = port::PhysicalCoreID(); + size_t core_idx; + if (UNLIKELY(cpuid < 0)) { + // cpu id unavailable, just pick randomly + core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_); + } else { + core_idx = static_cast<size_t>(cpuid & ((1 << size_shift_) - 1)); + } + return {AccessAtCore(core_idx), core_idx}; +} + +template <typename T> +T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const { + assert(core_idx < static_cast<size_t>(1) << size_shift_); + return &data_[core_idx]; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/crc32c.cc b/src/rocksdb/util/crc32c.cc new file mode 100644 index 00000000..9e4b65e6 --- /dev/null +++ b/src/rocksdb/util/crc32c.cc @@ -0,0 +1,1231 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. 
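Before the CRC code, a quick usage sketch for the CoreLocalArray template defined above: per-core counters with a cheap hot path and an explicit aggregation walk. Illustrative only; it compiles only inside the RocksDB tree, since CoreLocalArray relies on port::PhysicalCoreID().

#include <atomic>
#include <cstdint>
#include "util/core_local.h"

// One slot per core; the struct wrapper guarantees zero-initialization.
struct Counter {
  std::atomic<uint64_t> v{0};
};

rocksdb::CoreLocalArray<Counter> hit_counters;

void RecordHit() {
  // Cheap, mostly contention-free increment on this core's slot.
  hit_counters.Access()->v.fetch_add(1, std::memory_order_relaxed);
}

uint64_t TotalHits() {
  // Aggregation walks every slot with AccessAtCore().
  uint64_t sum = 0;
  for (size_t i = 0; i < hit_counters.Size(); ++i) {
    sum += hit_counters.AccessAtCore(i)->v.load(std::memory_order_relaxed);
  }
  return sum;
}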
+#include "util/crc32c.h" +#include <stdint.h> +#ifdef HAVE_SSE42 +#include <nmmintrin.h> +#include <wmmintrin.h> +#endif +#include "util/coding.h" +#include "util/util.h" + +#ifdef __powerpc64__ +#include "util/crc32c_ppc.h" +#include "util/crc32c_ppc_constants.h" + +#if __linux__ +#include <sys/auxv.h> + +#ifndef PPC_FEATURE2_VEC_CRYPTO +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#endif /* __linux__ */ + +#endif + +namespace rocksdb { +namespace crc32c { + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +#ifdef __powerpc64__ +static int arch_ppc_crc32 = 0; +#endif /* __powerpc64__ */ +#endif + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 
0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 
+ 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 
0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast<const char*>(p)); +} + +#if defined(HAVE_SSE42) && (defined(__LP64__) || defined(_WIN64)) +static inline uint64_t LE_LOAD64(const uint8_t *p) { + return DecodeFixed64(reinterpret_cast<const char*>(p)); +} +#endif + +static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { + uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p)); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 
8) & 0xff] ^
+      table1_[(c >> 16) & 0xff] ^
+      table0_[c >> 24];
+  // Do it twice.
+  c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^
+      table2_[(c >> 8) & 0xff] ^
+      table1_[(c >> 16) & 0xff] ^
+      table0_[c >> 24];
+}
+
+static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
+#ifndef HAVE_SSE42
+  Slow_CRC32(l, p);
+#elif defined(__LP64__) || defined(_WIN64)
+  *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
+  *p += 8;
+#else
+  *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+  *p += 4;
+  *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+  *p += 4;
+#endif
+}
+
+template<void (*CRC32)(uint64_t*, uint8_t const**)>
+uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
+
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *e = p + size;
+  uint64_t l = crc ^ 0xffffffffu;
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+#define STEP1 do {                              \
+    int c = (l & 0xff) ^ *p++;                  \
+    l = table0_[c] ^ (l >> 8);                  \
+} while (0)
+
+
+  // Point x at the first 16-byte aligned byte in the string. This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+  if (x <= e) {
+    // Process bytes until finished or p is 16-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e-p) >= 16) {
+    CRC32(&l, &p);
+    CRC32(&l, &p);
+  }
+  // Process bytes 8 at a time
+  while ((e-p) >= 8) {
+    CRC32(&l, &p);
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP1
+#undef ALIGN
+  return static_cast<uint32_t>(l ^ 0xffffffffu);
+}
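For orientation, this is the byte-at-a-time reference loop that STEP1 unrolls; the extra tables table1_ through table3_ exist only so Slow_CRC32 can retire four input bytes per round of lookups (slicing-by-four). A sketch, not part of the original file, reusing table0_ from above:

static inline uint32_t NaiveExtend(uint32_t crc, const char* buf, size_t size) {
  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
  uint64_t l = crc ^ 0xffffffffu;
  for (size_t i = 0; i < size; ++i) {
    // Exactly STEP1: fold one input byte into the low bits, shift, look up.
    int c = static_cast<int>((l & 0xff) ^ p[i]);
    l = table0_[c] ^ (l >> 8);
  }
  return static_cast<uint32_t>(l ^ 0xffffffffu);
}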
+// Detect whether SSE4.2 is available at runtime.
+#ifndef HAVE_POWER8
+
+static bool isSSE42() {
+#ifndef HAVE_SSE42
+  return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+  return c_ & (1U << 20);  // SSE4.2 is bit 20 of ECX (copied from CpuId.h in Folly)
+#elif defined(_WIN64)
+  int info[4];
+  __cpuidex(info, 0x00000001, 0);
+  return (info[2] & ((int)1 << 20)) != 0;
+#else
+  return false;
+#endif
+}
+
+static bool isPCLMULQDQ() {
+#ifndef HAVE_SSE42
+  // in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ
+  // are supported by the compiler
+  return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+  return c_ & (1U << 1);  // PCLMULQDQ is in bit 1 (not bit 0)
+#elif defined(_WIN64)
+  int info[4];
+  __cpuidex(info, 0x00000001, 0);
+  return (info[2] & ((int)1 << 1)) != 0;
+#else
+  return false;
+#endif
+}
+
+#endif  // HAVE_POWER8
+
+typedef uint32_t (*Function)(uint32_t, const char*, size_t);
+
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) {
+  return crc32c_ppc(crc, (const unsigned char *)buf, size);
+}
+
+#if __linux__
+static int arch_ppc_probe(void) {
+  arch_ppc_crc32 = 0;
+
+#if defined(__powerpc64__)
+  if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1;
+#endif /* __powerpc64__ */
+
+  return arch_ppc_crc32;
+}
+#endif  // __linux__
+
+static bool isAltiVec() {
+  return arch_ppc_probe() != 0;
+}
+#endif
+
+
+std::string IsFastCrc32Supported() {
+  bool has_fast_crc = false;
+  std::string fast_zero_msg;
+  std::string arch;
+#ifdef HAVE_POWER8
+#ifdef HAS_ALTIVEC
+  if (arch_ppc_probe()) {
+    has_fast_crc = true;
+    arch = "PPC";
+  }
+#else
+  has_fast_crc = false;
+  arch = "PPC";
+#endif
+#else
+  has_fast_crc = isSSE42();
+  arch = "x86";
+#endif
+  if (has_fast_crc) {
+    fast_zero_msg.append("Supported on " + arch);
+  } else {
+    fast_zero_msg.append("Not supported on " + arch);
+  }
+  return fast_zero_msg;
+}
+
+
+/*
+ * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the author be held liable for any damages
+ *  arising from the use of this software.
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *  1. The origin of this software must not be misrepresented; you must not
+ *   claim that you wrote the original software. If you use this software
+ *   in a product, an acknowledgment in the product documentation would be
+ *   appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *   misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ * Ferry Toth + * ftoth@exalondelft.nl + * + * https://github.com/htot/crc32c + * + * Modified by Facebook + * + * Original intel whitepaper: + * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * + * This version is from the folly library, created by Dave Watson <davejwatson@fb.com> + * +*/ +#if defined HAVE_SSE42 && defined HAVE_PCLMUL + +#define CRCtriplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ + crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); + +#define CRCduplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); + +#define CRCsinglet(crc, buf, offset) \ + crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); + + +// Numbers taken directly from intel whitepaper. +// clang-format off +const uint64_t clmul_constants[] = { + 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, + 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, + 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, + 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, + 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, + 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, + 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, + 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, + 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, + 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, + 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, + 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, + 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, + 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, + 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, + 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, + 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, + 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, + 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, + 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, + 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, + 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, + 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, + 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, + 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, + 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, + 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, + 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, + 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, + 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, + 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, + 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, + 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, + 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, + 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, + 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844, + 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, + 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, + 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, + 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, + 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, + 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, + 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, + 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, + 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, + 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768, + 
0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4, + 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, + 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, + 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, + 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, + 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, + 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, + 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, + 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, + 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, + 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, + 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, + 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, + 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, + 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, + 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, + 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, + 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, +}; + +// Compute the crc32c value for buffer smaller than 8 +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void align_to_8( + size_t len, + uint64_t& crc0, // crc so far, updated on return + const unsigned char*& next) { // next data pointer, updated on return + uint32_t crc32bit = static_cast<uint32_t>(crc0); + if (len & 0x04) { + crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); + next += sizeof(uint32_t); + } + if (len & 0x02) { + crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next); + next += sizeof(uint16_t); + } + if (len & 0x01) { + crc32bit = _mm_crc32_u8(crc32bit, *(next)); + next++; + } + crc0 = crc32bit; +} + +// +// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well +// chosen constant and xor's these with the remaining CRC. +// +inline uint64_t CombineCRC( + size_t block_size, + uint64_t crc0, + uint64_t crc1, + uint64_t crc2, + const uint64_t* next2) { + const auto multiplier = + *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1); + const auto crc0_xmm = _mm_set_epi64x(0, crc0); + const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); + const auto crc1_xmm = _mm_set_epi64x(0, crc1); + const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); + const auto res = _mm_xor_si128(res0, res1); + crc0 = _mm_cvtsi128_si64(res); + crc0 = crc0 ^ *((uint64_t*)next2 - 1); + crc2 = _mm_crc32_u64(crc2, crc0); + return crc2; +} + +// Compute CRC-32C using the Intel hardware instruction. 
+#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { + const unsigned char* next = (const unsigned char*)buf; + uint64_t count; + uint64_t crc0, crc1, crc2; + crc0 = crc ^ 0xffffffffu; + + + if (len >= 8) { + // if len > 216 then align and use triplets + if (len > 216) { + { + // Work on the bytes (< 8) before the first 8-byte alignment addr starts + uint64_t align_bytes = (8 - (uintptr_t)next) & 7; + len -= align_bytes; + align_to_8(align_bytes, crc0, next); + } + + // Now work on the remaining blocks + count = len / 24; // number of triplets + len %= 24; // bytes remaining + uint64_t n = count >> 7; // #blocks = first block + full blocks + uint64_t block_size = count & 127; + if (block_size == 0) { + block_size = 128; + } else { + n++; + } + // points to the first byte of the next block + const uint64_t* next0 = (uint64_t*)next + block_size; + const uint64_t* next1 = next0 + block_size; + const uint64_t* next2 = next1 + block_size; + + crc1 = crc2 = 0; + // Use Duff's device, a for() loop inside a switch() + // statement. This needs to execute at least once, round len + // down to nearest triplet multiple + switch (block_size) { + case 128: + do { + // jumps here for a full block of len 128 + CRCtriplet(crc, next, -128); + FALLTHROUGH_INTENDED; + case 127: + // jumps here or below for the first block smaller + CRCtriplet(crc, next, -127); + FALLTHROUGH_INTENDED; + case 126: + CRCtriplet(crc, next, -126); // than 128 + FALLTHROUGH_INTENDED; + case 125: + CRCtriplet(crc, next, -125); + FALLTHROUGH_INTENDED; + case 124: + CRCtriplet(crc, next, -124); + FALLTHROUGH_INTENDED; + case 123: + CRCtriplet(crc, next, -123); + FALLTHROUGH_INTENDED; + case 122: + CRCtriplet(crc, next, -122); + FALLTHROUGH_INTENDED; + case 121: + CRCtriplet(crc, next, -121); + FALLTHROUGH_INTENDED; + case 120: + CRCtriplet(crc, next, -120); + FALLTHROUGH_INTENDED; + case 119: + CRCtriplet(crc, next, -119); + FALLTHROUGH_INTENDED; + case 118: + CRCtriplet(crc, next, -118); + FALLTHROUGH_INTENDED; + case 117: + CRCtriplet(crc, next, -117); + FALLTHROUGH_INTENDED; + case 116: + CRCtriplet(crc, next, -116); + FALLTHROUGH_INTENDED; + case 115: + CRCtriplet(crc, next, -115); + FALLTHROUGH_INTENDED; + case 114: + CRCtriplet(crc, next, -114); + FALLTHROUGH_INTENDED; + case 113: + CRCtriplet(crc, next, -113); + FALLTHROUGH_INTENDED; + case 112: + CRCtriplet(crc, next, -112); + FALLTHROUGH_INTENDED; + case 111: + CRCtriplet(crc, next, -111); + FALLTHROUGH_INTENDED; + case 110: + CRCtriplet(crc, next, -110); + FALLTHROUGH_INTENDED; + case 109: + CRCtriplet(crc, next, -109); + FALLTHROUGH_INTENDED; + case 108: + CRCtriplet(crc, next, -108); + FALLTHROUGH_INTENDED; + case 107: + CRCtriplet(crc, next, -107); + FALLTHROUGH_INTENDED; + case 106: + CRCtriplet(crc, next, -106); + FALLTHROUGH_INTENDED; + case 105: + CRCtriplet(crc, next, -105); + FALLTHROUGH_INTENDED; + case 104: + CRCtriplet(crc, next, -104); + FALLTHROUGH_INTENDED; + case 103: + CRCtriplet(crc, next, -103); + FALLTHROUGH_INTENDED; + case 102: + CRCtriplet(crc, next, -102); + FALLTHROUGH_INTENDED; + case 101: + CRCtriplet(crc, next, -101); + FALLTHROUGH_INTENDED; + case 100: + CRCtriplet(crc, next, -100); + FALLTHROUGH_INTENDED; + case 99: + CRCtriplet(crc, next, -99); + FALLTHROUGH_INTENDED; + case 98: + CRCtriplet(crc, next, -98); + FALLTHROUGH_INTENDED; + case 97: + 
CRCtriplet(crc, next, -97); + FALLTHROUGH_INTENDED; + case 96: + CRCtriplet(crc, next, -96); + FALLTHROUGH_INTENDED; + case 95: + CRCtriplet(crc, next, -95); + FALLTHROUGH_INTENDED; + case 94: + CRCtriplet(crc, next, -94); + FALLTHROUGH_INTENDED; + case 93: + CRCtriplet(crc, next, -93); + FALLTHROUGH_INTENDED; + case 92: + CRCtriplet(crc, next, -92); + FALLTHROUGH_INTENDED; + case 91: + CRCtriplet(crc, next, -91); + FALLTHROUGH_INTENDED; + case 90: + CRCtriplet(crc, next, -90); + FALLTHROUGH_INTENDED; + case 89: + CRCtriplet(crc, next, -89); + FALLTHROUGH_INTENDED; + case 88: + CRCtriplet(crc, next, -88); + FALLTHROUGH_INTENDED; + case 87: + CRCtriplet(crc, next, -87); + FALLTHROUGH_INTENDED; + case 86: + CRCtriplet(crc, next, -86); + FALLTHROUGH_INTENDED; + case 85: + CRCtriplet(crc, next, -85); + FALLTHROUGH_INTENDED; + case 84: + CRCtriplet(crc, next, -84); + FALLTHROUGH_INTENDED; + case 83: + CRCtriplet(crc, next, -83); + FALLTHROUGH_INTENDED; + case 82: + CRCtriplet(crc, next, -82); + FALLTHROUGH_INTENDED; + case 81: + CRCtriplet(crc, next, -81); + FALLTHROUGH_INTENDED; + case 80: + CRCtriplet(crc, next, -80); + FALLTHROUGH_INTENDED; + case 79: + CRCtriplet(crc, next, -79); + FALLTHROUGH_INTENDED; + case 78: + CRCtriplet(crc, next, -78); + FALLTHROUGH_INTENDED; + case 77: + CRCtriplet(crc, next, -77); + FALLTHROUGH_INTENDED; + case 76: + CRCtriplet(crc, next, -76); + FALLTHROUGH_INTENDED; + case 75: + CRCtriplet(crc, next, -75); + FALLTHROUGH_INTENDED; + case 74: + CRCtriplet(crc, next, -74); + FALLTHROUGH_INTENDED; + case 73: + CRCtriplet(crc, next, -73); + FALLTHROUGH_INTENDED; + case 72: + CRCtriplet(crc, next, -72); + FALLTHROUGH_INTENDED; + case 71: + CRCtriplet(crc, next, -71); + FALLTHROUGH_INTENDED; + case 70: + CRCtriplet(crc, next, -70); + FALLTHROUGH_INTENDED; + case 69: + CRCtriplet(crc, next, -69); + FALLTHROUGH_INTENDED; + case 68: + CRCtriplet(crc, next, -68); + FALLTHROUGH_INTENDED; + case 67: + CRCtriplet(crc, next, -67); + FALLTHROUGH_INTENDED; + case 66: + CRCtriplet(crc, next, -66); + FALLTHROUGH_INTENDED; + case 65: + CRCtriplet(crc, next, -65); + FALLTHROUGH_INTENDED; + case 64: + CRCtriplet(crc, next, -64); + FALLTHROUGH_INTENDED; + case 63: + CRCtriplet(crc, next, -63); + FALLTHROUGH_INTENDED; + case 62: + CRCtriplet(crc, next, -62); + FALLTHROUGH_INTENDED; + case 61: + CRCtriplet(crc, next, -61); + FALLTHROUGH_INTENDED; + case 60: + CRCtriplet(crc, next, -60); + FALLTHROUGH_INTENDED; + case 59: + CRCtriplet(crc, next, -59); + FALLTHROUGH_INTENDED; + case 58: + CRCtriplet(crc, next, -58); + FALLTHROUGH_INTENDED; + case 57: + CRCtriplet(crc, next, -57); + FALLTHROUGH_INTENDED; + case 56: + CRCtriplet(crc, next, -56); + FALLTHROUGH_INTENDED; + case 55: + CRCtriplet(crc, next, -55); + FALLTHROUGH_INTENDED; + case 54: + CRCtriplet(crc, next, -54); + FALLTHROUGH_INTENDED; + case 53: + CRCtriplet(crc, next, -53); + FALLTHROUGH_INTENDED; + case 52: + CRCtriplet(crc, next, -52); + FALLTHROUGH_INTENDED; + case 51: + CRCtriplet(crc, next, -51); + FALLTHROUGH_INTENDED; + case 50: + CRCtriplet(crc, next, -50); + FALLTHROUGH_INTENDED; + case 49: + CRCtriplet(crc, next, -49); + FALLTHROUGH_INTENDED; + case 48: + CRCtriplet(crc, next, -48); + FALLTHROUGH_INTENDED; + case 47: + CRCtriplet(crc, next, -47); + FALLTHROUGH_INTENDED; + case 46: + CRCtriplet(crc, next, -46); + FALLTHROUGH_INTENDED; + case 45: + CRCtriplet(crc, next, -45); + FALLTHROUGH_INTENDED; + case 44: + CRCtriplet(crc, next, -44); + FALLTHROUGH_INTENDED; + case 43: + CRCtriplet(crc, next, -43); + 
FALLTHROUGH_INTENDED; + case 42: + CRCtriplet(crc, next, -42); + FALLTHROUGH_INTENDED; + case 41: + CRCtriplet(crc, next, -41); + FALLTHROUGH_INTENDED; + case 40: + CRCtriplet(crc, next, -40); + FALLTHROUGH_INTENDED; + case 39: + CRCtriplet(crc, next, -39); + FALLTHROUGH_INTENDED; + case 38: + CRCtriplet(crc, next, -38); + FALLTHROUGH_INTENDED; + case 37: + CRCtriplet(crc, next, -37); + FALLTHROUGH_INTENDED; + case 36: + CRCtriplet(crc, next, -36); + FALLTHROUGH_INTENDED; + case 35: + CRCtriplet(crc, next, -35); + FALLTHROUGH_INTENDED; + case 34: + CRCtriplet(crc, next, -34); + FALLTHROUGH_INTENDED; + case 33: + CRCtriplet(crc, next, -33); + FALLTHROUGH_INTENDED; + case 32: + CRCtriplet(crc, next, -32); + FALLTHROUGH_INTENDED; + case 31: + CRCtriplet(crc, next, -31); + FALLTHROUGH_INTENDED; + case 30: + CRCtriplet(crc, next, -30); + FALLTHROUGH_INTENDED; + case 29: + CRCtriplet(crc, next, -29); + FALLTHROUGH_INTENDED; + case 28: + CRCtriplet(crc, next, -28); + FALLTHROUGH_INTENDED; + case 27: + CRCtriplet(crc, next, -27); + FALLTHROUGH_INTENDED; + case 26: + CRCtriplet(crc, next, -26); + FALLTHROUGH_INTENDED; + case 25: + CRCtriplet(crc, next, -25); + FALLTHROUGH_INTENDED; + case 24: + CRCtriplet(crc, next, -24); + FALLTHROUGH_INTENDED; + case 23: + CRCtriplet(crc, next, -23); + FALLTHROUGH_INTENDED; + case 22: + CRCtriplet(crc, next, -22); + FALLTHROUGH_INTENDED; + case 21: + CRCtriplet(crc, next, -21); + FALLTHROUGH_INTENDED; + case 20: + CRCtriplet(crc, next, -20); + FALLTHROUGH_INTENDED; + case 19: + CRCtriplet(crc, next, -19); + FALLTHROUGH_INTENDED; + case 18: + CRCtriplet(crc, next, -18); + FALLTHROUGH_INTENDED; + case 17: + CRCtriplet(crc, next, -17); + FALLTHROUGH_INTENDED; + case 16: + CRCtriplet(crc, next, -16); + FALLTHROUGH_INTENDED; + case 15: + CRCtriplet(crc, next, -15); + FALLTHROUGH_INTENDED; + case 14: + CRCtriplet(crc, next, -14); + FALLTHROUGH_INTENDED; + case 13: + CRCtriplet(crc, next, -13); + FALLTHROUGH_INTENDED; + case 12: + CRCtriplet(crc, next, -12); + FALLTHROUGH_INTENDED; + case 11: + CRCtriplet(crc, next, -11); + FALLTHROUGH_INTENDED; + case 10: + CRCtriplet(crc, next, -10); + FALLTHROUGH_INTENDED; + case 9: + CRCtriplet(crc, next, -9); + FALLTHROUGH_INTENDED; + case 8: + CRCtriplet(crc, next, -8); + FALLTHROUGH_INTENDED; + case 7: + CRCtriplet(crc, next, -7); + FALLTHROUGH_INTENDED; + case 6: + CRCtriplet(crc, next, -6); + FALLTHROUGH_INTENDED; + case 5: + CRCtriplet(crc, next, -5); + FALLTHROUGH_INTENDED; + case 4: + CRCtriplet(crc, next, -4); + FALLTHROUGH_INTENDED; + case 3: + CRCtriplet(crc, next, -3); + FALLTHROUGH_INTENDED; + case 2: + CRCtriplet(crc, next, -2); + FALLTHROUGH_INTENDED; + case 1: + CRCduplet(crc, next, -1); // the final triplet is actually only 2 + //{ CombineCRC(); } + crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); + if (--n > 0) { + crc1 = crc2 = 0; + block_size = 128; + // points to the first byte of the next block + next0 = next2 + 128; + next1 = next0 + 128; // from here on all blocks are 128 long + next2 = next1 + 128; + } + FALLTHROUGH_INTENDED; + case 0:; + } while (n > 0); + } + next = (const unsigned char*)next2; + } + uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets + len = len & 7; + next += (count2 * 8); + switch (count2) { + case 27: + CRCsinglet(crc0, next, -27 * 8); + FALLTHROUGH_INTENDED; + case 26: + CRCsinglet(crc0, next, -26 * 8); + FALLTHROUGH_INTENDED; + case 25: + CRCsinglet(crc0, next, -25 * 8); + FALLTHROUGH_INTENDED; + case 24: + CRCsinglet(crc0, next, -24 * 8); + 
FALLTHROUGH_INTENDED; + case 23: + CRCsinglet(crc0, next, -23 * 8); + FALLTHROUGH_INTENDED; + case 22: + CRCsinglet(crc0, next, -22 * 8); + FALLTHROUGH_INTENDED; + case 21: + CRCsinglet(crc0, next, -21 * 8); + FALLTHROUGH_INTENDED; + case 20: + CRCsinglet(crc0, next, -20 * 8); + FALLTHROUGH_INTENDED; + case 19: + CRCsinglet(crc0, next, -19 * 8); + FALLTHROUGH_INTENDED; + case 18: + CRCsinglet(crc0, next, -18 * 8); + FALLTHROUGH_INTENDED; + case 17: + CRCsinglet(crc0, next, -17 * 8); + FALLTHROUGH_INTENDED; + case 16: + CRCsinglet(crc0, next, -16 * 8); + FALLTHROUGH_INTENDED; + case 15: + CRCsinglet(crc0, next, -15 * 8); + FALLTHROUGH_INTENDED; + case 14: + CRCsinglet(crc0, next, -14 * 8); + FALLTHROUGH_INTENDED; + case 13: + CRCsinglet(crc0, next, -13 * 8); + FALLTHROUGH_INTENDED; + case 12: + CRCsinglet(crc0, next, -12 * 8); + FALLTHROUGH_INTENDED; + case 11: + CRCsinglet(crc0, next, -11 * 8); + FALLTHROUGH_INTENDED; + case 10: + CRCsinglet(crc0, next, -10 * 8); + FALLTHROUGH_INTENDED; + case 9: + CRCsinglet(crc0, next, -9 * 8); + FALLTHROUGH_INTENDED; + case 8: + CRCsinglet(crc0, next, -8 * 8); + FALLTHROUGH_INTENDED; + case 7: + CRCsinglet(crc0, next, -7 * 8); + FALLTHROUGH_INTENDED; + case 6: + CRCsinglet(crc0, next, -6 * 8); + FALLTHROUGH_INTENDED; + case 5: + CRCsinglet(crc0, next, -5 * 8); + FALLTHROUGH_INTENDED; + case 4: + CRCsinglet(crc0, next, -4 * 8); + FALLTHROUGH_INTENDED; + case 3: + CRCsinglet(crc0, next, -3 * 8); + FALLTHROUGH_INTENDED; + case 2: + CRCsinglet(crc0, next, -2 * 8); + FALLTHROUGH_INTENDED; + case 1: + CRCsinglet(crc0, next, -1 * 8); + FALLTHROUGH_INTENDED; + case 0:; + } + } + { + align_to_8(len, crc0, next); + return (uint32_t)crc0 ^ 0xffffffffu; + } +} + +#endif //HAVE_SSE42 && HAVE_PCLMUL + +static inline Function Choose_Extend() { +#ifndef HAVE_POWER8 + if (isSSE42()) { + if (isPCLMULQDQ()) { +#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C + return crc32c_3way; +#else + return ExtendImpl<Fast_CRC32>; // Fast_CRC32 will check HAVE_SSE42 itself +#endif + } + else { // no runtime PCLMULQDQ support but has SSE42 support + return ExtendImpl<Fast_CRC32>; + } + } // end of isSSE42() + else { + return ExtendImpl<Slow_CRC32>; + } +#else //HAVE_POWER8 + return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>; +#endif +} + +static Function ChosenExtend = Choose_Extend(); +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + return ChosenExtend(crc, buf, size); +} + + +} // namespace crc32c +} // namespace rocksdb diff --git a/src/rocksdb/util/crc32c.h b/src/rocksdb/util/crc32c.h new file mode 100644 index 00000000..faee5d54 --- /dev/null +++ b/src/rocksdb/util/crc32c.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stddef.h> +#include <stdint.h> +#include <string> + +namespace rocksdb { +namespace crc32c { + +extern std::string IsFastCrc32Supported(); + +// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the +// crc32c of some string A. Extend() is often used to maintain the +// crc32c of a stream of data. 
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); + +// Return the crc32c of data[0,n-1] +inline uint32_t Value(const char* data, size_t n) { + return Extend(0, data, n); +} + +static const uint32_t kMaskDelta = 0xa282ead8ul; + +// Return a masked representation of crc. +// +// Motivation: it is problematic to compute the CRC of a string that +// contains embedded CRCs. Therefore we recommend that CRCs stored +// somewhere (e.g., in files) should be masked before being stored. +inline uint32_t Mask(uint32_t crc) { + // Rotate right by 15 bits and add a constant. + return ((crc >> 15) | (crc << 17)) + kMaskDelta; +} + +// Return the crc whose masked representation is masked_crc. +inline uint32_t Unmask(uint32_t masked_crc) { + uint32_t rot = masked_crc - kMaskDelta; + return ((rot >> 17) | (rot << 15)); +} + +} // namespace crc32c +} // namespace rocksdb diff --git a/src/rocksdb/util/crc32c_ppc.c b/src/rocksdb/util/crc32c_ppc.c new file mode 100644 index 00000000..3c517c88 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc.c @@ -0,0 +1,95 @@ +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#define CRC_TABLE +#include <inttypes.h> +#include <stdlib.h> +#include <strings.h> +#include "util/crc32c_ppc_constants.h" + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN - 1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, unsigned char const *p, + unsigned long len) { + while (len--) crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#endif + +#ifdef HAVE_POWER8 +unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p, + unsigned long len); + +static uint32_t crc32_vpmsum(uint32_t crc, unsigned char const *data, + unsigned len) { + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, data, (unsigned long)len); + goto out; + } + + if ((unsigned long)data & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)data & VMX_ALIGN_MASK); + crc = crc32_align(crc, data, prealign); + len -= prealign; + data += prealign; + } + + crc = __crc32_vpmsum(crc, data, (unsigned long)len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + data += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, data, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} + +/* This wrapper function works around the fact that crc32_vpmsum + * does not gracefully handle the case where the data pointer is NULL. There + * may be room for performance improvement here. + */ +uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len) { + unsigned char *buf2; + + if (!data) { + buf2 = (unsigned char *)malloc(len); + bzero(buf2, len); + crc = crc32_vpmsum(crc, buf2, len); + free(buf2); + } else { + crc = crc32_vpmsum(crc, data, (unsigned long)len); + } + return crc; +} + +#else /* HAVE_POWER8 */ + +/* This symbol has to exist on non-ppc architectures (and on legacy + * ppc systems using power7 or below) in order to compile properly + * there, even though it won't be called. 
+ */ +uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len) { + return 0; +} + +#endif /* HAVE_POWER8 */ diff --git a/src/rocksdb/util/crc32c_ppc.h b/src/rocksdb/util/crc32c_ppc.h new file mode 100644 index 00000000..3bcaecfe --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc.h @@ -0,0 +1,20 @@ +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, + unsigned len); + +#ifdef __cplusplus +} +#endif diff --git a/src/rocksdb/util/crc32c_ppc_asm.S b/src/rocksdb/util/crc32c_ppc_asm.S new file mode 100644 index 00000000..6de79797 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc_asm.S @@ -0,0 +1,753 @@ +// Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#include <ppc-asm.h> +#include "ppc-opcode.h" + +#undef toc + +#ifndef r1 +#define r1 1 +#endif + +#ifndef r2 +#define r2 2 +#endif + + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#define __ASSEMBLY__ +#include "crc32c_ppc_constants.h" + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(__crc32_vpmsum) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 + + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, r3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ 
+#endif + +#ifdef BYTESWAP_DATA + addis r3,r2,.byteswap_constant@toc@ha + addi r3,r3,.byteswap_constant@toc@l + + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + addis r3,r2,.constants@toc@ha + addi r3,r3,.constants@toc@l + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. + */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. 
+ */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. 
r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + addis r3,r2,.barrett_constants@toc@ha + addi r3,r3,.barrett_constants@toc@l + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit + +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. 
We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(r3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + addis r3,r2,.short_constants@toc@ha + addi r3,r3,.short_constants@toc@l + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? */ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(__crc32_vpmsum) diff --git a/src/rocksdb/util/crc32c_ppc_constants.h b/src/rocksdb/util/crc32c_ppc_constants.h new file mode 100644 index 00000000..57d66303 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc_constants.h @@ -0,0 +1,901 @@ +// 
Copyright (C) 2015, 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// This source code is also licensed under the GPLv2 license found in the +// COPYING file in the root directory of this source tree. + +#pragma once + +#define CRC 0x1edc6f41 +#define REFLECT +#define CRC_XOR + +#ifndef __ASSEMBLY__ +#ifdef CRC_TABLE +static const unsigned int crc_table[] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 
0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, +}; + +#endif + +#else +#define MAX_SIZE 32768 +.constants : + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 
0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` 
<< 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, 
x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 
mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` 
<< 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 
0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod 
p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + + .short_constants : + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include + the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod + p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod + p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod + p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod + p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod + p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod + p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod + p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod + p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` + */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + .barrett_constants : + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + 
.octa 0x00000000000000000000000105ec76f1 +#endif diff --git a/src/rocksdb/util/crc32c_test.cc b/src/rocksdb/util/crc32c_test.cc new file mode 100644 index 00000000..d5983586 --- /dev/null +++ b/src/rocksdb/util/crc32c_test.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "util/crc32c.h" +#include "util/testharness.h" +#include "util/coding.h" + +namespace rocksdb { +namespace crc32c { + +class CRC { }; + + +// Tests for 3-way crc32c algorithm. We need these tests because it uses +// different lookup tables than the original Fast_CRC32 +const unsigned int BUFFER_SIZE = 512 * 1024 * sizeof(uint64_t); +char buffer[BUFFER_SIZE]; + +struct ExpectedResult { + size_t offset; + size_t length; + uint32_t crc32c; +}; + +ExpectedResult expectedResults[] = { + // Zero-byte input + { 0, 0, ~0U }, + // Small aligned inputs to test special cases in SIMD implementations + { 8, 1, 1543413366 }, + { 8, 2, 523493126 }, + { 8, 3, 1560427360 }, + { 8, 4, 3422504776 }, + { 8, 5, 447841138 }, + { 8, 6, 3910050499 }, + { 8, 7, 3346241981 }, + // Small unaligned inputs + { 9, 1, 3855826643 }, + { 10, 2, 560880875 }, + { 11, 3, 1479707779 }, + { 12, 4, 2237687071 }, + { 13, 5, 4063855784 }, + { 14, 6, 2553454047 }, + { 15, 7, 1349220140 }, + // Larger inputs to test leftover chunks at the end of aligned blocks + { 8, 8, 627613930 }, + { 8, 9, 2105929409 }, + { 8, 10, 2447068514 }, + { 8, 11, 863807079 }, + { 8, 12, 292050879 }, + { 8, 13, 1411837737 }, + { 8, 14, 2614515001 }, + { 8, 15, 3579076296 }, + { 8, 16, 2897079161 }, + { 8, 17, 675168386 }, + // // Much larger inputs + { 0, BUFFER_SIZE, 2096790750 }, + { 1, BUFFER_SIZE / 2, 3854797577 }, + +}; + +TEST(CRC, StandardResults) { + + // Original Fast_CRC32 tests. + // From rfc3720 section B.4. + char buf[32]; + + memset(buf, 0, sizeof(buf)); + ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf))); + + memset(buf, 0xff, sizeof(buf)); + ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = static_cast<char>(i); + } + ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = static_cast<char>(31 - i); + } + ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf))); + + unsigned char data[48] = { + 0x01, 0xc0, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x18, + 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + }; + ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data))); + + // 3-Way Crc32c tests ported from folly. 
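// (The two loops below lean on the stitching identity, assuming only the
// declarations in crc32c.h: Extend(Value(a, m), b, n) == Value(a||b, m+n),
// i.e. a checksum may be computed in one shot or from consecutive pieces.)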
+ // Test 1: single computation + for (auto expected : expectedResults) { + uint32_t result = Value(buffer + expected.offset, expected.length); + EXPECT_EQ(~expected.crc32c, result); + } + + // Test 2: stitching two computations + for (auto expected : expectedResults) { + size_t partialLength = expected.length / 2; + uint32_t partialChecksum = Value(buffer + expected.offset, partialLength); + uint32_t result = Extend(partialChecksum, + buffer + expected.offset + partialLength, + expected.length - partialLength); + EXPECT_EQ(~expected.crc32c, result); + } + +} + +TEST(CRC, Values) { + ASSERT_NE(Value("a", 1), Value("foo", 3)); +} + +TEST(CRC, Extend) { + ASSERT_EQ(Value("hello world", 11), + Extend(Value("hello ", 6), "world", 5)); +} + +TEST(CRC, Mask) { + uint32_t crc = Value("foo", 3); + ASSERT_NE(crc, Mask(crc)); + ASSERT_NE(crc, Mask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); +} + +} // namespace crc32c +} // namespace rocksdb + +// copied from folly +const uint64_t FNV_64_HASH_START = 14695981039346656037ULL; +inline uint64_t fnv64_buf(const void* buf, + size_t n, + uint64_t hash = FNV_64_HASH_START) { + // forcing signed char, since other platforms can use unsigned + const signed char* char_buf = reinterpret_cast<const signed char*>(buf); + + for (size_t i = 0; i < n; ++i) { + hash += (hash << 1) + (hash << 4) + (hash << 5) + (hash << 7) + + (hash << 8) + (hash << 40); + hash ^= char_buf[i]; + } + return hash; +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + // Populate a buffer with a deterministic pattern + // on which to compute checksums + + const uint8_t* src = (uint8_t*)rocksdb::crc32c::buffer; + uint64_t* dst = (uint64_t*)rocksdb::crc32c::buffer; + const uint64_t* end = (const uint64_t*)(rocksdb::crc32c::buffer + rocksdb::crc32c::BUFFER_SIZE); + *dst++ = 0; + while (dst < end) { + rocksdb::EncodeFixed64(reinterpret_cast<char*>(dst), fnv64_buf((const char*)src, sizeof(uint64_t))); + dst++; + src += sizeof(uint64_t); + } + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/delete_scheduler.cc b/src/rocksdb/util/delete_scheduler.cc new file mode 100644 index 00000000..f5ee2844 --- /dev/null +++ b/src/rocksdb/util/delete_scheduler.cc @@ -0,0 +1,353 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
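// A rough usage sketch of the scheduler implemented below (hedged: the
// surrounding Env/Logger/SstFileManagerImpl wiring is assumed, not shown
// in this diff). Files routed through DeleteFile() are renamed to
// "<name>.trash" and drained by a background thread at the configured rate:
//
//   DeleteScheduler ds(env, 1 << 20 /* ~1MB/s */, info_log, sfm,
//                      /*max_trash_db_ratio=*/0.25,
//                      /*bytes_max_delete_chunk=*/0);
//   ds.DeleteFile("/db/000123.sst", "/db", /*force_bg=*/false);
//   ds.WaitForEmptyTrash();  // block until the trash queue is empty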
+ +#ifndef ROCKSDB_LITE + +#include "util/delete_scheduler.h" + +#include <thread> +#include <vector> + +#include "port/port.h" +#include "rocksdb/env.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/sst_file_manager_impl.h" +#include "util/sync_point.h" + +namespace rocksdb { + +DeleteScheduler::DeleteScheduler(Env* env, int64_t rate_bytes_per_sec, + Logger* info_log, + SstFileManagerImpl* sst_file_manager, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) + : env_(env), + total_trash_size_(0), + rate_bytes_per_sec_(rate_bytes_per_sec), + pending_files_(0), + bytes_max_delete_chunk_(bytes_max_delete_chunk), + closing_(false), + cv_(&mu_), + info_log_(info_log), + sst_file_manager_(sst_file_manager), + max_trash_db_ratio_(max_trash_db_ratio) { + assert(sst_file_manager != nullptr); + assert(max_trash_db_ratio >= 0); + bg_thread_.reset( + new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this)); +} + +DeleteScheduler::~DeleteScheduler() { + { + InstrumentedMutexLock l(&mu_); + closing_ = true; + cv_.SignalAll(); + } + if (bg_thread_) { + bg_thread_->join(); + } +} + +Status DeleteScheduler::DeleteFile(const std::string& file_path, + const std::string& dir_to_sync, + const bool force_bg) { + Status s; + if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && + total_trash_size_.load() > + sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { + // Rate limiting is disabled or trash size makes up more than + // max_trash_db_ratio_ (default 25%) of the total DB size + TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); + s = env_->DeleteFile(file_path); + if (s.ok()) { + sst_file_manager_->OnDeleteFile(file_path); + } + return s; + } + + // Move file to trash + std::string trash_file; + s = MarkAsTrash(file_path, &trash_file); + + if (!s.ok()) { + ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash", file_path.c_str()); + s = env_->DeleteFile(file_path); + if (s.ok()) { + sst_file_manager_->OnDeleteFile(file_path); + } + return s; + } + + // Update the total trash size + uint64_t trash_file_size = 0; + env_->GetFileSize(trash_file, &trash_file_size); + total_trash_size_.fetch_add(trash_file_size); + + // Add file to delete queue + { + InstrumentedMutexLock l(&mu_); + queue_.emplace(trash_file, dir_to_sync); + pending_files_++; + if (pending_files_ == 1) { + cv_.SignalAll(); + } + } + return s; +} + +std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() { + InstrumentedMutexLock l(&mu_); + return bg_errors_; +} + +const std::string DeleteScheduler::kTrashExtension = ".trash"; +bool DeleteScheduler::IsTrashFile(const std::string& file_path) { + return (file_path.size() >= kTrashExtension.size() && + file_path.rfind(kTrashExtension) == + file_path.size() - kTrashExtension.size()); +} + +Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm, + const std::string& path) { + Status s; + // Check if there are any files marked as trash in this path + std::vector<std::string> files_in_path; + s = env->GetChildren(path, &files_in_path); + if (!s.ok()) { + return s; + } + for (const std::string& current_file : files_in_path) { + if (!DeleteScheduler::IsTrashFile(current_file)) { + // not a trash file, skip + continue; + } + + Status file_delete; + std::string trash_file = path + "/" + current_file; + if (sfm) { + // We have an SstFileManager that will schedule the file delete + sfm->OnAddFile(trash_file); + file_delete = sfm->ScheduleFileDeletion(trash_file, path); + } else { + // Delete the file immediately + 
file_delete = env->DeleteFile(trash_file);
+ }
+
+ if (s.ok() && !file_delete.ok()) {
+ s = file_delete;
+ }
+ }
+
+ return s;
+}
+
+Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
+ std::string* trash_file) {
+ // Sanity check of the path
+ size_t idx = file_path.rfind("/");
+ if (idx == std::string::npos || idx == file_path.size() - 1) {
+ return Status::InvalidArgument("file_path is corrupted");
+ }
+
+ Status s;
+ if (DeleteScheduler::IsTrashFile(file_path)) {
+ // This is already a trash file
+ *trash_file = file_path;
+ return s;
+ }
+
+ *trash_file = file_path + kTrashExtension;
+ // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
+ // file_move_mu mutex.
+ int cnt = 0;
+ InstrumentedMutexLock l(&file_move_mu_);
+ while (true) {
+ s = env_->FileExists(*trash_file);
+ if (s.IsNotFound()) {
+ // We found a path for our file in trash
+ s = env_->RenameFile(file_path, *trash_file);
+ break;
+ } else if (s.ok()) {
+ // Name conflict, generate a new numbered suffix
+ *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
+ } else {
+ // Error during FileExists call, we cannot continue
+ break;
+ }
+ cnt++;
+ }
+ if (s.ok()) {
+ sst_file_manager_->OnMoveFile(file_path, *trash_file);
+ }
+ return s;
+}
+
+void DeleteScheduler::BackgroundEmptyTrash() {
+ TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
+
+ while (true) {
+ InstrumentedMutexLock l(&mu_);
+ while (queue_.empty() && !closing_) {
+ cv_.Wait();
+ }
+
+ if (closing_) {
+ return;
+ }
+
+ // Delete all files in queue_
+ uint64_t start_time = env_->NowMicros();
+ uint64_t total_deleted_bytes = 0;
+ int64_t current_delete_rate = rate_bytes_per_sec_.load();
+ while (!queue_.empty() && !closing_) {
+ if (current_delete_rate != rate_bytes_per_sec_.load()) {
+ // User changed the delete rate
+ current_delete_rate = rate_bytes_per_sec_.load();
+ start_time = env_->NowMicros();
+ total_deleted_bytes = 0;
+ }
+
+ // Get new file to delete
+ const FileAndDir& fad = queue_.front();
+ std::string path_in_trash = fad.fname;
+
+ // We don't need to hold the lock while deleting the file
+ mu_.Unlock();
+ uint64_t deleted_bytes = 0;
+ bool is_complete = true;
+ // Delete the file from trash and accumulate the deleted bytes
+ Status s =
+ DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+ total_deleted_bytes += deleted_bytes;
+ mu_.Lock();
+ if (is_complete) {
+ queue_.pop();
+ }
+
+ if (!s.ok()) {
+ bg_errors_[path_in_trash] = s;
+ }
+
+ // Apply penalty if necessary
+ uint64_t total_penalty;
+ if (current_delete_rate > 0) {
+ // rate limiting is enabled
+ total_penalty =
+ ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
+ while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) {}
+ } else {
+ // rate limiting is disabled
+ total_penalty = 0;
+ }
+ TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
+ &total_penalty);
+
+ if (is_complete) {
+ pending_files_--;
+ }
+ if (pending_files_ == 0) {
+ // Unblock WaitForEmptyTrash since there are no more files waiting
+ // to be deleted
+ cv_.SignalAll();
+ }
+ }
+ }
+}
+
+Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes,
+ bool* is_complete) {
+ uint64_t file_size;
+ Status s = env_->GetFileSize(path_in_trash, &file_size);
+ *is_complete = true;
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
+ if (s.ok()) {
+ bool need_full_delete = true;
+ if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
+ uint64_t num_hard_links = 2;
+ // We don't have to worry about a data race between linking a new
+ // file after the file-link-count check and the ftruncate, because
+ // the file is now in trash and RocksDB is not supposed to create
+ // hard links to trash files.
+ Status my_status = env_->NumFileLinks(path_in_trash, &num_hard_links);
+ if (my_status.ok()) {
+ if (num_hard_links == 1) {
+ std::unique_ptr<WritableFile> wf;
+ my_status =
+ env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
+ if (my_status.ok()) {
+ my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
+ if (my_status.ok()) {
+ TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+ my_status = wf->Fsync();
+ }
+ }
+ if (my_status.ok()) {
+ *deleted_bytes = bytes_max_delete_chunk_;
+ need_full_delete = false;
+ *is_complete = false;
+ } else {
+ ROCKS_LOG_WARN(info_log_,
+ "Failed to partially delete %s from trash -- %s",
+ path_in_trash.c_str(), my_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_INFO(info_log_,
+ "Cannot delete %s slowly through ftruncate from trash "
+ "as it has other links",
+ path_in_trash.c_str());
+ }
+ } else if (!num_link_error_printed_) {
+ ROCKS_LOG_INFO(
+ info_log_,
+ "Cannot delete files slowly through ftruncate from trash "
+ "as Env::NumFileLinks() returns error: %s",
+ my_status.ToString().c_str());
+ num_link_error_printed_ = true;
+ }
+ }
+
+ if (need_full_delete) {
+ s = env_->DeleteFile(path_in_trash);
+ if (!dir_to_sync.empty()) {
+ std::unique_ptr<Directory> dir_obj;
+ if (s.ok()) {
+ s = env_->NewDirectory(dir_to_sync, &dir_obj);
+ }
+ if (s.ok()) {
+ s = dir_obj->Fsync();
+ TEST_SYNC_POINT_CALLBACK(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir",
+ reinterpret_cast<void*>(const_cast<std::string*>(&dir_to_sync)));
+ }
+ }
+ *deleted_bytes = file_size;
+ sst_file_manager_->OnDeleteFile(path_in_trash);
+ }
+ }
+ if (!s.ok()) {
+ // Error while getting file size or while deleting
+ ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s",
+ path_in_trash.c_str(), s.ToString().c_str());
+ *deleted_bytes = 0;
+ } else {
+ total_trash_size_.fetch_sub(*deleted_bytes);
+ }
+
+ return s;
+}
+
+void DeleteScheduler::WaitForEmptyTrash() {
+ InstrumentedMutexLock l(&mu_);
+ while (pending_files_ > 0 && !closing_) {
+ cv_.Wait();
+ }
+}
+
+} // namespace rocksdb
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/util/delete_scheduler.h b/src/rocksdb/util/delete_scheduler.h
new file mode 100644
index 00000000..29b70517
--- /dev/null
+++ b/src/rocksdb/util/delete_scheduler.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
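+
+// Editorial sketch (not part of the upstream source): a DeleteScheduler is
+// normally owned by an SstFileManagerImpl rather than built directly.
+// Mirroring how delete_scheduler_test.cc constructs one (env, logger, rate,
+// trash ratio, delete chunk), with placeholder values and paths:
+//
+//   SstFileManagerImpl sfm(env, nullptr /* logger */,
+//                          1024 * 1024 /* rate_bytes_per_sec, 1 MB/s */,
+//                          0.25 /* max_trash_db_ratio */,
+//                          128 * 1024 /* bytes_max_delete_chunk */);
+//   DeleteScheduler* ds = sfm.delete_scheduler();
+//   ds->DeleteFile("/db/000123.sst", "/db");  // renamed to 000123.sst.trash
+//   ds->WaitForEmptyTrash();                  // block until trash is drained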
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Env;
+class Logger;
+class SstFileManagerImpl;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion.
+// Instead of deleting files immediately, files are marked as trash
+// and deleted in a background thread that applies a sleep penalty between
+// deletes if they are happening at a rate faster than rate_bytes_per_sec.
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0; in this
+// case DeleteScheduler will delete files immediately.
class DeleteScheduler {
+ public:
+ DeleteScheduler(Env* env, int64_t rate_bytes_per_sec, Logger* info_log,
+ SstFileManagerImpl* sst_file_manager,
+ double max_trash_db_ratio, uint64_t bytes_max_delete_chunk);
+
+ ~DeleteScheduler();
+
+ // Return delete rate limit in bytes per second
+ int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_.load(); }
+
+ // Set delete rate limit in bytes per second
+ void SetRateBytesPerSecond(int64_t bytes_per_sec) {
+ rate_bytes_per_sec_.store(bytes_per_sec);
+ }
+
+ // Mark the file as trash and schedule its deletion. If force_bg is
+ // set, it forces the file to always be deleted in the background thread,
+ // except when rate limiting is disabled
+ Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
+ const bool force_bg = false);
+
+ // Wait for all files being deleted in the background to finish, or for
+ // the destructor to be called.
+ void WaitForEmptyTrash();
+
+ // Return a map containing errors that happened in BackgroundEmptyTrash
+ // file_path => error status
+ std::map<std::string, Status> GetBackgroundErrors();
+
+ uint64_t GetTotalTrashSize() { return total_trash_size_.load(); }
+
+ // Return trash/DB size ratio where new files will be deleted immediately
+ double GetMaxTrashDBRatio() {
+ return max_trash_db_ratio_.load();
+ }
+
+ // Update trash/DB size ratio where new files will be deleted immediately
+ void SetMaxTrashDBRatio(double r) {
+ assert(r >= 0);
+ max_trash_db_ratio_.store(r);
+ }
+
+ static const std::string kTrashExtension;
+ static bool IsTrashFile(const std::string& file_path);
+
+ // Check if there are any .trash files in path and schedule their deletion,
+ // or delete them immediately if sst_file_manager is nullptr
+ static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+ const std::string& path);
+
+ private:
+ Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+
+ Status DeleteTrashFile(const std::string& path_in_trash,
+ const std::string& dir_to_sync,
+ uint64_t* deleted_bytes, bool* is_complete);
+
+ void BackgroundEmptyTrash();
+
+ Env* env_;
+ // total size of trash files
+ std::atomic<uint64_t> total_trash_size_;
+ // Maximum number of bytes that should be deleted per second
+ std::atomic<int64_t> rate_bytes_per_sec_;
+ // Mutex to protect queue_, pending_files_, bg_errors_, closing_
+ InstrumentedMutex mu_;
+
+ struct FileAndDir {
+ FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+ std::string fname;
+ std::string dir; // if empty, directory sync is skipped.
+ };
+
+ // Queue of trash files that need to be deleted
+ std::queue<FileAndDir> queue_;
+ // Number of trash files that are waiting to be deleted
+ int32_t pending_files_;
+ uint64_t bytes_max_delete_chunk_;
+ // Errors that happened in BackgroundEmptyTrash (file_path => error)
+ std::map<std::string, Status> bg_errors_;
+
+ bool num_link_error_printed_ = false;
+ // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
+ bool closing_;
+ // Condition variable signaled in these conditions
+ // - pending_files_ value change from 0 => 1
+ // - pending_files_ value change from 1 => 0
+ // - closing_ value is set to true
+ InstrumentedCondVar cv_;
+ // Background thread running BackgroundEmptyTrash
+ std::unique_ptr<port::Thread> bg_thread_;
+ // Mutex to protect threads from file name conflicts
+ InstrumentedMutex file_move_mu_;
+ Logger* info_log_;
+ SstFileManagerImpl* sst_file_manager_;
+ // If the trash size constitutes more than this fraction of the total DB
+ // size, we will start deleting new files passed to DeleteScheduler
+ // immediately
+ std::atomic<double> max_trash_db_ratio_;
+ static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+};
+
+} // namespace rocksdb
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/util/delete_scheduler_test.cc b/src/rocksdb/util/delete_scheduler_test.cc
new file mode 100644
index 00000000..0d8e354b
--- /dev/null
+++ b/src/rocksdb/util/delete_scheduler_test.cc
@@ -0,0 +1,696 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <atomic>
+#include <thread>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "util/delete_scheduler.h"
+#include "util/sst_file_manager_impl.h"
+#include "util/string_util.h"
+#include "util/sync_point.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace rocksdb {
+
+class DeleteSchedulerTest : public testing::Test {
+ public:
+ DeleteSchedulerTest() : env_(Env::Default()) {
+ const int kNumDataDirs = 3;
+ dummy_files_dirs_.reserve(kNumDataDirs);
+ for (size_t i = 0; i < kNumDataDirs; ++i) {
+ dummy_files_dirs_.emplace_back(
+ test::PerThreadDBPath(env_, "delete_scheduler_dummy_data_dir") +
+ ToString(i));
+ DestroyAndCreateDir(dummy_files_dirs_.back());
+ }
+ }
+
+ ~DeleteSchedulerTest() override {
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({});
+ rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+ for (const auto& dummy_files_dir : dummy_files_dirs_) {
+ test::DestroyDir(env_, dummy_files_dir);
+ }
+ }
+
+ void DestroyAndCreateDir(const std::string& dir) {
+ ASSERT_OK(test::DestroyDir(env_, dir));
+ EXPECT_OK(env_->CreateDir(dir));
+ }
+
+ int CountNormalFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int normal_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (!DeleteScheduler::IsTrashFile(f) && f != "." && f != "..") {
+ normal_cnt++;
+ }
+ }
+ return normal_cnt;
+ }
+
+ int CountTrashFiles(size_t dummy_files_dirs_idx = 0) {
+ std::vector<std::string> files_in_dir;
+ EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+ &files_in_dir));
+
+ int trash_cnt = 0;
+ for (auto& f : files_in_dir) {
+ if (DeleteScheduler::IsTrashFile(f)) {
+ trash_cnt++;
+ }
+ }
+ return trash_cnt;
+ }
+
+ std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
+ size_t dummy_files_dirs_idx = 0) {
+ std::string file_path =
+ dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
+ std::unique_ptr<WritableFile> f;
+ env_->NewWritableFile(file_path, &f, EnvOptions());
+ std::string data(size, 'A');
+ EXPECT_OK(f->Append(data));
+ EXPECT_OK(f->Close());
+ sst_file_mgr_->OnAddFile(file_path, false);
+ return file_path;
+ }
+
+ void NewDeleteScheduler() {
+ // Tests in this file are for the DeleteScheduler component and don't
+ // create any DBs, so we need to set max_trash_db_ratio above 100%
+ // (1.1 instead of the default 0.25)
+ sst_file_mgr_.reset(
+ new SstFileManagerImpl(env_, nullptr, rate_bytes_per_sec_,
+ /* max_trash_db_ratio= */ 1.1, 128 * 1024));
+ delete_scheduler_ = sst_file_mgr_->delete_scheduler();
+ }
+
+ Env* env_;
+ std::vector<std::string> dummy_files_dirs_;
+ int64_t rate_bytes_per_sec_;
+ DeleteScheduler* delete_scheduler_;
+ std::unique_ptr<SstFileManagerImpl> sst_file_mgr_;
+};
+
+// Test the basic functionality of DeleteScheduler (Rate Limiting).
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in trash
+// 4- Verify that BackgroundEmptyTrash used the correct penalties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BasicRateLimiting:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ int dir_synced = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile::AfterSyncDir", [&](void* arg) {
+ dir_synced++;
+ std::string* dir = reinterpret_cast<std::string*>(arg);
+ EXPECT_EQ(dummy_files_dirs_[0], *dir);
+ });
+
+ int num_files = 100; // 100 files
+ uint64_t file_size = 1024; // every file is 1 kb
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ rocksdb::SyncPoint::GetInstance()->ClearTrace();
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ dir_synced = 0;
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i],
+ dummy_files_dirs_[0]));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ uint64_t delete_start_time = env_->NowMicros();
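+ // Reaching this sync point releases the dependency loaded above, letting
+ // BackgroundEmptyTrash start draining the queue while we time it.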
+ TEST_SYNC_POINT("DeleteSchedulerTest::BasicRateLimiting:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+
+ ASSERT_EQ(num_files, dir_synced);
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+ rate_bytes_per_sec_ = 1 << 20; // 1MB
+ NewDeleteScheduler();
+
+ // Generate dummy files in multiple directories
+ const size_t kNumFiles = dummy_files_dirs_.size();
+ const size_t kFileSize = 1 << 10; // 1KB
+ std::vector<std::string> generated_files;
+ for (size_t i = 0; i < kNumFiles; i++) {
+ generated_files.push_back(NewDummyFile("file", kFileSize, i));
+ ASSERT_EQ(1, CountNormalFiles(i));
+ }
+
+ // Mark dummy files as trash
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(1, CountTrashFiles(i));
+ }
+ TEST_SYNC_POINT("DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1");
+ delete_scheduler_->WaitForEmptyTrash();
+
+ // Verify dummy files eventually got deleted
+ for (size_t i = 0; i < kNumFiles; i++) {
+ ASSERT_EQ(0, CountNormalFiles(i));
+ ASSERT_EQ(0, CountTrashFiles(i));
+ }
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Same as the BasicRateLimiting test but delete files in multiple threads.
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler using 10 threads
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Verify that BackgroundEmptyTrash used the correct penalties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ int thread_cnt = 10;
+ int num_files = 10; // 10 files per thread
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ rocksdb::SyncPoint::GetInstance()->ClearTrace();
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ NewDeleteScheduler();
+
+ // Create 100 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files using 10 threads and measure time spent to empty trash
+ std::atomic<int> thread_num(0);
+ std::vector<port::Thread> threads;
+ std::function<void()> delete_thread = [&]() {
+ int idx = thread_num.fetch_add(1);
+ int range_start = idx * num_files;
+ int range_end = range_start + num_files;
+ for (int j = range_start; j < range_end; j++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[j], ""));
+ }
+ };
+
+ for (int i = 0; i < thread_cnt; i++) {
+ threads.emplace_back(delete_thread);
+ }
+
+ for (size_t i = 0; i < threads.size(); i++) {
+ threads[i].join();
+ }
+
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::RateLimitingMultiThreaded:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), num_files * thread_cnt);
+ for (int i = 0; i < num_files * thread_cnt; i++) {
+ total_files_size += file_size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+// Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure
+// that when DeleteScheduler deletes a file it deletes it immediately and
+// doesn't move it to trash
+TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
+ int bg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0;
+ NewDeleteScheduler();
+
+ for (int i = 0; i < 10; i++) {
+ // Every file we delete will be deleted immediately
+ std::string dummy_file = NewDummyFile("dummy.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ ASSERT_TRUE(env_->FileExists(dummy_file).IsNotFound());
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 0);
+ }
+
+ ASSERT_EQ(bg_delete_file, 0);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Testing that moving files to trash with the same name is not a problem
+// 1- Create 10 files with the same name "conflict.data"
+// 2- Delete the 10 files using DeleteScheduler
+// 3- Make sure that trash directory contains 10 files ("conflict.data" x 10)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that files are deleted from trash
+TEST_F(DeleteSchedulerTest, ConflictNames) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::ConflictNames:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Create "conflict.data" and move it to trash 10 times
+ for (int i = 0; i < 10; i++) {
+ std::string dummy_file = NewDummyFile("conflict.data");
+ ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ // 10 files ("conflict.data" x 10) in trash
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::ConflictNames:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete the 10 files using DeleteScheduler (move them to trash)
+// 3- Delete the 10 files directly (using env_->DeleteFile)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that DeleteScheduler failed to delete the 10 files and
+// reported 10 background errors
+TEST_F(DeleteSchedulerTest, BackgroundError) {
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::BackgroundError:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec
+ NewDeleteScheduler();
+
+ // Generate 10 dummy files and move them to trash
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ ASSERT_EQ(CountTrashFiles(), 10);
+
+ // Delete 10 files from trash, this will cause background errors in
+ // BackgroundEmptyTrash since we already deleted the files it was
+ // going to delete
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data.trash";
+ ASSERT_OK(env_->DeleteFile(dummy_files_dirs_[0] + "/" + file_name));
+ }
+
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DeleteSchedulerTest::BackgroundError:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 10);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete 10 dummy files using DeleteScheduler
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure all files in trash directory were deleted
+// 5- Repeat previous steps 5 times
+TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
+ int bg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ // Move files to trash, wait for empty trash, start again
+ for (int run = 1; run <= 5; run++) {
+ // Generate 10 dummy files and move them to trash
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+ delete_scheduler_->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, 10 * run);
+ ASSERT_EQ(CountTrashFiles(), 0);
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ }
+
+ ASSERT_EQ(bg_delete_file, 50);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DeletePartialFile) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ // Should be deleted in 4 batches
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_1", 500 * 1024), ""));
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 100 * 1024), ""));
+ // Should be deleted in 2 batches
+ ASSERT_OK(
+ delete_scheduler_->DeleteFile(NewDummyFile("data_2", 200 * 1024), ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(7, bg_delete_file);
+ ASSERT_EQ(4, bg_fsync);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+ int bg_delete_file = 0;
+ int bg_fsync = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void*) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
+ NewDeleteScheduler();
+
+ std::string file1 = NewDummyFile("data_1", 500 * 1024);
+ std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+ ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+ ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+ // Would be deleted in 4 batches if there were no hard links
+ ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+ ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+ delete_scheduler_->WaitForEmptyTrash();
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+ ASSERT_EQ(2, bg_delete_file);
+ ASSERT_EQ(0, bg_fsync);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
+// 2- Delete 100 files using DeleteScheduler
+// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
+// 4- Make sure that not all files were deleted from trash and that
+// DeleteScheduler background thread did not delete all files
+TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
+ int bg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 1; // 1 Byte / sec
+ NewDeleteScheduler();
+
+ for (int i = 0; i < 100; i++) {
+ std::string file_name = "data_" + ToString(i) + ".data";
+ ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+ }
+
+ // Deleting 100 files at 1 byte/sec would need >28 hours, so we destroy
+ // the DeleteScheduler while the delete queue is not empty
+ sst_file_mgr_.reset();
+
+ ASSERT_LT(bg_delete_file, 100);
+ ASSERT_GT(CountTrashFiles(), 0);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) {
+ std::vector<uint64_t> penalties;
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile",
+ [&](void* /*arg*/) { fg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({
+ {"DeleteSchedulerTest::DynamicRateLimiting1:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ rate_bytes_per_sec_ = 0; // Disable rate limiting initially
+ NewDeleteScheduler();
+
+
+ int num_files = 10; // 10 files
+ uint64_t file_size = 1024; // every file is 1 kb
+
+ std::vector<int64_t> delete_kbs_per_sec = {512, 200, 0, 100, 50, -2, 25};
+ for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+ penalties.clear();
+ bg_delete_file = 0;
+ fg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->ClearTrace();
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndCreateDir(dummy_files_dirs_[0]);
+ rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+ delete_scheduler_->SetRateBytesPerSecond(rate_bytes_per_sec_);
+
+ // Create 10 dummy files, every file is 1 Kb
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ // Delete dummy files and measure time spent to empty trash
+ for (int i = 0; i < num_files; i++) {
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+ }
+ ASSERT_EQ(CountNormalFiles(), 0);
+
+ if (rate_bytes_per_sec_ > 0) {
+ uint64_t delete_start_time = env_->NowMicros();
+ TEST_SYNC_POINT("DeleteSchedulerTest::DynamicRateLimiting1:1");
+ delete_scheduler_->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+ ASSERT_EQ(bg_errors.size(), 0);
+
+ uint64_t total_files_size = 0;
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), num_files);
+ for (int i = 0; i < num_files; i++) {
+ total_files_size += file_size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
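+ // The measured wall-clock time should be at least ~90% of the final
+ // accumulated penalty.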
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+ ASSERT_EQ(bg_delete_file, num_files);
+ ASSERT_EQ(fg_delete_file, 0);
+ } else {
+ ASSERT_EQ(penalties.size(), 0);
+ ASSERT_EQ(bg_delete_file, 0);
+ ASSERT_EQ(fg_delete_file, num_files);
+ }
+
+ ASSERT_EQ(CountTrashFiles(), 0);
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) {
+ int bg_delete_file = 0;
+ int fg_delete_file = 0;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_files = 100; // 100 files
+ uint64_t file_size = 1024 * 10; // 10 KB per file
+ rate_bytes_per_sec_ = 1; // 1 byte per sec (very slow trash delete)
+
+ NewDeleteScheduler();
+ delete_scheduler_->SetMaxTrashDBRatio(0.25);
+
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file" + ToString(i) + ".data";
+ generated_files.push_back(NewDummyFile(file_name, file_size));
+ }
+
+ for (std::string& file_name : generated_files) {
+ delete_scheduler_->DeleteFile(file_name, "");
+ }
+
+ // When we end up with 26 files in trash we will start
+ // deleting new files immediately
+ ASSERT_EQ(fg_delete_file, 74);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, IsTrashCheck) {
+ // Trash files
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("x.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile(".trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("/a/b/c/abc..sst.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("^^^^^.log.trash"));
+ ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.t.trash"));
+
+ // Not trash files
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.txt"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sst"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sstrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("^^^^^.trashh"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile(".ttrash"));
+ ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+int main(int /*argc*/, char** /*argv*/) {
+ printf("DeleteScheduler is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/util/duplicate_detector.h b/src/rocksdb/util/duplicate_detector.h
new file mode 100644
index 00000000..40a1cbd1
--- /dev/null
+++ b/src/rocksdb/util/duplicate_detector.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
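+
+// Editorial sketch (not part of the upstream source): during WAL recovery a
+// write-batch handler would query the detector once per key/sequence pair,
+// with db_impl, cf_id, key, and seq as placeholders, for example:
+//
+//   DuplicateDetector detector(db_impl);
+//   if (detector.IsDuplicateKeySeq(cf_id, key, seq)) {
+//     // the same (cf, key) already appeared under this batch's sequence
+//     // number, so this insertion would have been a duplicate
+//   }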
+
+#pragma once
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+
+#include "util/set_comparator.h"
+
+namespace rocksdb {
+// During recovery, if the memtable is flushed we cannot rely on it for
+// duplicate key detection, as the key insert will not be attempted. This
+// class is used as an emulator of the memtable to tell whether inserting
+// a key/seq would have resulted in a duplication.
+class DuplicateDetector {
+ public:
+ explicit DuplicateDetector(DBImpl* db) : db_(db) {}
+ bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) {
+ assert(seq >= batch_seq_);
+ if (batch_seq_ != seq) { // it is a new batch
+ keys_.clear();
+ }
+ batch_seq_ = seq;
+ CFKeys& cf_keys = keys_[cf];
+ if (cf_keys.size() == 0) { // just inserted
+ InitWithComp(cf);
+ }
+ auto it = cf_keys.insert(key);
+ if (it.second == false) { // second is false if an element already existed.
+ keys_.clear();
+ InitWithComp(cf);
+ keys_[cf].insert(key);
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ SequenceNumber batch_seq_ = 0;
+ DBImpl* db_;
+ using CFKeys = std::set<Slice, SetComparator>;
+ std::map<uint32_t, CFKeys> keys_;
+ void InitWithComp(const uint32_t cf) {
+ auto h = db_->GetColumnFamilyHandle(cf);
+ if (!h) {
+ // TODO(myabandeh): This is not a concern in MyRocks as drop cf is not
+ // implemented yet. When it is, we should return a proper error instead
+ // of throwing an exception.
+ ROCKS_LOG_FATAL(
+ db_->immutable_db_options().info_log,
+ "Recovering an entry from the dropped column family %" PRIu32
+ ". WAL must have been emptied before dropping the column "
+ "family", cf);
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error(
+ "Recovering an entry from a dropped column family. "
+ "WAL must have been flushed before dropping the column "
+ "family");
+#endif
+ return;
+ }
+ auto cmp = h->GetComparator();
+ keys_[cf] = CFKeys(SetComparator(cmp));
+ }
+};
+} // namespace rocksdb
diff --git a/src/rocksdb/util/dynamic_bloom.cc b/src/rocksdb/util/dynamic_bloom.cc
new file mode 100644
index 00000000..8e90efd8
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "dynamic_bloom.h"
+
+#include <algorithm>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/allocator.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+
+uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
+ uint32_t num_blocks =
+ (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+ // Make num_blocks an odd number to make sure more bits are involved
+ // when determining which block.
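+ // (An odd block count shares no factor with the power-of-two hash range,
+ // so every hash bit can influence which block is chosen.)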
+ if (num_blocks % 2 == 0) {
+ num_blocks++;
+ }
+
+ return num_blocks * (CACHE_LINE_SIZE * 8);
+}
+}
+
+DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, uint32_t num_probes,
+ size_t huge_page_tlb_size, Logger* logger)
+ : DynamicBloom(num_probes) {
+ SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger);
+}
+
+DynamicBloom::DynamicBloom(uint32_t num_probes)
+ : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {}
+
+void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
+ uint32_t num_blocks) {
+ data_ = reinterpret_cast<std::atomic<uint8_t>*>(raw_data);
+ kTotalBits = total_bits;
+ kNumBlocks = num_blocks;
+}
+
+void DynamicBloom::SetTotalBits(Allocator* allocator,
+ uint32_t total_bits, uint32_t locality,
+ size_t huge_page_tlb_size,
+ Logger* logger) {
+ kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
+ : (total_bits + 7) / 8 * 8;
+ kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+
+ assert(kNumBlocks > 0 || kTotalBits > 0);
+ assert(kNumProbes > 0);
+
+ uint32_t sz = kTotalBits / 8;
+ if (kNumBlocks > 0) {
+ sz += CACHE_LINE_SIZE - 1;
+ }
+ assert(allocator);
+
+ char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+ memset(raw, 0, sz);
+ auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
+ if (kNumBlocks > 0 && cache_line_offset > 0) {
+ raw += CACHE_LINE_SIZE - cache_line_offset;
+ }
+ data_ = reinterpret_cast<std::atomic<uint8_t>*>(raw);
+}
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/dynamic_bloom.h b/src/rocksdb/util/dynamic_bloom.h
new file mode 100644
index 00000000..654bc9ad
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.h
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/slice.h"
+
+#include "port/port.h"
+#include "util/hash.h"
+
+#include <atomic>
+#include <memory>
+
+namespace rocksdb {
+
+class Slice;
+class Allocator;
+class Logger;
+
+class DynamicBloom {
+ public:
+ // allocator: pass the allocator to the bloom filter so that its memory
+ // usage can be traced
+ // total_bits: fixed total bits for the bloom
+ // num_probes: number of hash probes for a single key
+ // locality: If positive, optimize for cache line locality, 0 otherwise.
+ // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
+ // within this page size. Need to reserve huge pages for
+ // it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ explicit DynamicBloom(Allocator* allocator,
+ uint32_t total_bits, uint32_t locality = 0,
+ uint32_t num_probes = 6,
+ size_t huge_page_tlb_size = 0,
+ Logger* logger = nullptr);
+
+ explicit DynamicBloom(uint32_t num_probes = 6);
+
+ void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, size_t huge_page_tlb_size,
+ Logger* logger);
+
+ ~DynamicBloom() {}
+
+ // Assuming single threaded access to this function.
+ void Add(const Slice& key);
+
+ // Like Add, but may be called concurrently with other functions.
+ void AddConcurrently(const Slice& key);
+
+ // Assuming single threaded access to this function.
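+ // The hash is expected to come from BloomHash(key), as in Add() below.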
+ void AddHash(uint32_t hash);
+
+ // Like AddHash, but may be called concurrently with other functions.
+ void AddHashConcurrently(uint32_t hash);
+
+ // Multithreaded access to this function is OK
+ bool MayContain(const Slice& key) const;
+
+ // Multithreaded access to this function is OK
+ bool MayContainHash(uint32_t hash) const;
+
+ void Prefetch(uint32_t h);
+
+ uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+ Slice GetRawData() const {
+ return Slice(reinterpret_cast<char*>(data_), GetTotalBits() / 8);
+ }
+
+ void SetRawData(unsigned char* raw_data, uint32_t total_bits,
+ uint32_t num_blocks = 0);
+
+ uint32_t GetTotalBits() const { return kTotalBits; }
+
+ bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
+ private:
+ uint32_t kTotalBits;
+ uint32_t kNumBlocks;
+ const uint32_t kNumProbes;
+
+ std::atomic<uint8_t>* data_;
+
+ // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
+ // concurrency safety, working with bytes.
+ template <typename OrFunc>
+ void AddHash(uint32_t hash, const OrFunc& or_func);
+};
+
+inline void DynamicBloom::Add(const Slice& key) { AddHash(BloomHash(key)); }
+
+inline void DynamicBloom::AddConcurrently(const Slice& key) {
+ AddHashConcurrently(BloomHash(key));
+}
+
+inline void DynamicBloom::AddHash(uint32_t hash) {
+ AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+ ptr->store(ptr->load(std::memory_order_relaxed) | mask,
+ std::memory_order_relaxed);
+ });
+}
+
+inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
+ AddHash(hash, [](std::atomic<uint8_t>* ptr, uint8_t mask) {
+ // Happens-before between AddHash and MayContain is handled by
+ // access to versions_->LastSequence(), so all we have to do here is
+ // avoid races (so we don't give the compiler a license to mess up
+ // our code) and not lose bits. std::memory_order_relaxed is enough
+ // for that.
+ if ((mask & ptr->load(std::memory_order_relaxed)) != mask) {
+ ptr->fetch_or(mask, std::memory_order_relaxed);
+ }
+ });
+}
+
+inline bool DynamicBloom::MayContain(const Slice& key) const {
+ return (MayContainHash(BloomHash(key)));
+}
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+inline void DynamicBloom::Prefetch(uint32_t h) {
+ if (kNumBlocks != 0) {
+ uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
+ PREFETCH(&(data_[b / 8]), 0, 3);
+ }
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+inline bool DynamicBloom::MayContainHash(uint32_t h) const {
+ assert(IsInitialized());
+ const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
+ if (kNumBlocks != 0) {
+ uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+ // to a simple AND operation by the compiler.
+ const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
+ uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
+ if ((byteval & (1 << (bitpos % 8))) == 0) {
+ return false;
+ }
+ // Rotate h so that we don't reuse the same bytes.
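+ // (The division/multiplication below swap the low bits, which pick the
+ // bit within the cache line, with the high bits, so each probe draws on
+ // fresh hash bits while staying in the same cache line.)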
+ h = h / (CACHE_LINE_SIZE * 8) +
+ (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
+ h += delta;
+ }
+ } else {
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ const uint32_t bitpos = h % kTotalBits;
+ uint8_t byteval = data_[bitpos / 8].load(std::memory_order_relaxed);
+ if ((byteval & (1 << (bitpos % 8))) == 0) {
+ return false;
+ }
+ h += delta;
+ }
+ }
+ return true;
+}
+
+template <typename OrFunc>
+inline void DynamicBloom::AddHash(uint32_t h, const OrFunc& or_func) {
+ assert(IsInitialized());
+ const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
+ if (kNumBlocks != 0) {
+ uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
+ // to a simple AND operation by the compiler.
+ const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
+ or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
+ // Rotate h so that we don't reuse the same bytes.
+ h = h / (CACHE_LINE_SIZE * 8) +
+ (h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
+ h += delta;
+ }
+ } else {
+ for (uint32_t i = 0; i < kNumProbes; ++i) {
+ const uint32_t bitpos = h % kTotalBits;
+ or_func(&data_[bitpos / 8], (1 << (bitpos % 8)));
+ h += delta;
+ }
+ }
+}
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/dynamic_bloom_test.cc b/src/rocksdb/util/dynamic_bloom_test.cc
new file mode 100644
index 00000000..4244bff1
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom_test.cc
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> +#include <algorithm> +#include <atomic> +#include <functional> +#include <memory> +#include <thread> +#include <vector> + +#include "dynamic_bloom.h" +#include "port/port.h" +#include "util/arena.h" +#include "util/gflags_compat.h" +#include "util/logging.h" +#include "util/stop_watch.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(bits_per_key, 10, ""); +DEFINE_int32(num_probes, 6, ""); +DEFINE_bool(enable_perf, false, ""); + +namespace rocksdb { + +static Slice Key(uint64_t i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class DynamicBloomTest : public testing::Test {}; + +TEST_F(DynamicBloomTest, EmptyFilter) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 0, 2); + ASSERT_TRUE(!bloom1.MayContain("hello")); + ASSERT_TRUE(!bloom1.MayContain("world")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + ASSERT_TRUE(!bloom2.MayContain("hello")); + ASSERT_TRUE(!bloom2.MayContain("world")); +} + +TEST_F(DynamicBloomTest, Small) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 0, 2); + bloom1.Add("hello"); + bloom1.Add("world"); + ASSERT_TRUE(bloom1.MayContain("hello")); + ASSERT_TRUE(bloom1.MayContain("world")); + ASSERT_TRUE(!bloom1.MayContain("x")); + ASSERT_TRUE(!bloom1.MayContain("foo")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + bloom2.Add("hello"); + bloom2.Add("world"); + ASSERT_TRUE(bloom2.MayContain("hello")); + ASSERT_TRUE(bloom2.MayContain("world")); + ASSERT_TRUE(!bloom2.MayContain("x")); + ASSERT_TRUE(!bloom2.MayContain("foo")); +} + +TEST_F(DynamicBloomTest, SmallConcurrentAdd) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 0, 2); + bloom1.AddConcurrently("hello"); + bloom1.AddConcurrently("world"); + ASSERT_TRUE(bloom1.MayContain("hello")); + ASSERT_TRUE(bloom1.MayContain("world")); + ASSERT_TRUE(!bloom1.MayContain("x")); + ASSERT_TRUE(!bloom1.MayContain("foo")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + bloom2.AddConcurrently("hello"); + bloom2.AddConcurrently("world"); + ASSERT_TRUE(bloom2.MayContain("hello")); + ASSERT_TRUE(bloom2.MayContain("world")); + ASSERT_TRUE(!bloom2.MayContain("x")); + ASSERT_TRUE(!bloom2.MayContain("foo")); +} + +static uint32_t NextNum(uint32_t num) { + if (num < 10) { + num += 1; + } else if (num < 100) { + num += 10; + } else if (num < 1000) { + num += 100; + } else { + num += 1000; + } + return num; +} + +TEST_F(DynamicBloomTest, VaryingLengths) { + char buffer[sizeof(uint64_t)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes); + + fprintf(stderr, "bits_per_key: %d num_probes: %d\n", FLAGS_bits_per_key, + num_probes); + + for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) { + for (uint32_t num = 1; num <= 10000; num = NextNum(num)) { + uint32_t bloom_bits = 0; + Arena arena; + if (enable_locality == 0) { + bloom_bits = std::max(num * FLAGS_bits_per_key, 64U); + } else { + bloom_bits = std::max(num * FLAGS_bits_per_key, + enable_locality * CACHE_LINE_SIZE * 8); + } + DynamicBloom bloom(&arena, bloom_bits, enable_locality, num_probes); + for (uint64_t i = 0; i < num; i++) { + bloom.Add(Key(i, buffer)); + 
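// Probing a key right after adding it must always hit; a Bloom filter
+ // can return false positives but never false negatives.
+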
ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); + } + + // All added keys must match + for (uint64_t i = 0; i < num; i++) { + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) << "Num " << num + << "; key " << i; + } + + // Check false positive rate + + int result = 0; + for (uint64_t i = 0; i < 10000; i++) { + if (bloom.MayContain(Key(i + 1000000000, buffer))) { + result++; + } + } + double rate = result / 10000.0; + + fprintf(stderr, + "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, " + "enable locality?%u\n", + rate * 100.0, num, bloom_bits, enable_locality); + + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + + fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters, + mediocre_filters); + ASSERT_LE(mediocre_filters, good_filters / 5); + } +} + +TEST_F(DynamicBloomTest, perf) { + StopWatchNano timer(Env::Default()); + uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes); + + if (!FLAGS_enable_perf) { + return; + } + + for (uint32_t m = 1; m <= 8; ++m) { + Arena arena; + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8); + + DynamicBloom std_bloom(&arena, num_keys * 10, 0, num_probes); + + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + std_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8)); + } + + uint64_t elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n", + elapsed / num_keys); + + uint32_t count = 0; + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + if (std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8))) { + ++count; + } + } + ASSERT_EQ(count, num_keys); + elapsed = timer.ElapsedNanos(); + assert(count > 0); + fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n", + elapsed / count); + + // Locality enabled version + DynamicBloom blocked_bloom(&arena, num_keys * 10, 1, num_probes); + + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8)); + } + + elapsed = timer.ElapsedNanos(); + fprintf(stderr, + "blocked bloom(enable locality), avg add latency %" PRIu64 "\n", + elapsed / num_keys); + + count = 0; + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + if (blocked_bloom.MayContain( + Slice(reinterpret_cast<const char*>(&i), 8))) { + ++count; + } + } + + elapsed = timer.ElapsedNanos(); + assert(count > 0); + fprintf(stderr, + "blocked bloom(enable locality), avg query latency %" PRIu64 "\n", + elapsed / count); + ASSERT_TRUE(count == num_keys); + } +} + +TEST_F(DynamicBloomTest, concurrent_with_perf) { + StopWatchNano timer(Env::Default()); + uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes); + + uint32_t m_limit = FLAGS_enable_perf ? 8 : 1; + uint32_t locality_limit = FLAGS_enable_perf ? 
1 : 0; + + uint32_t num_threads = 4; + std::vector<port::Thread> threads; + + for (uint32_t m = 1; m <= m_limit; ++m) { + for (uint32_t locality = 0; locality <= locality_limit; ++locality) { + Arena arena; + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys with %" PRIu32 " locality\n", + m * 8, locality); + + DynamicBloom std_bloom(&arena, num_keys * 10, locality, num_probes); + + timer.Start(); + + std::function<void(size_t)> adder([&](size_t t) { + for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) { + std_bloom.AddConcurrently( + Slice(reinterpret_cast<const char*>(&i), 8)); + } + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(adder, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + uint64_t elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg parallel add latency %" PRIu64 + " nanos/key\n", + elapsed / num_keys); + + timer.Start(); + + std::function<void(size_t)> hitter([&](size_t t) { + for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) { + bool f = + std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8)); + ASSERT_TRUE(f); + } + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(hitter, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg parallel hit latency %" PRIu64 + " nanos/key\n", + elapsed / num_keys); + + timer.Start(); + + std::atomic<uint32_t> false_positives(0); + std::function<void(size_t)> misser([&](size_t t) { + for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys; + i += num_threads) { + bool f = + std_bloom.MayContain(Slice(reinterpret_cast<const char*>(&i), 8)); + if (f) { + ++false_positives; + } + } + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(misser, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg parallel miss latency %" PRIu64 + " nanos/key, %f%% false positive rate\n", + elapsed / num_keys, false_positives.load() * 100.0 / num_keys); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + + return RUN_ALL_TESTS(); +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/event_logger.cc b/src/rocksdb/util/event_logger.cc new file mode 100644 index 00000000..b488984f --- /dev/null +++ b/src/rocksdb/util/event_logger.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
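+
+// Editorial sketch (not part of the upstream source): EventLogger is used
+// through the streaming interface declared in event_logger.h, e.g.
+//
+//   EventLogger event_logger(info_log);  // info_log: some rocksdb::Logger*
+//   event_logger.Log() << "job" << 1 << "event" << "table_file_creation";
+//
+// which emits a single EVENT_LOG_v1 log line whose payload is one JSON
+// object (see the example at the bottom of event_logger.h).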
+ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "util/event_logger.h" + +#include <inttypes.h> +#include <cassert> +#include <sstream> +#include <string> + +#include "util/logging.h" +#include "util/string_util.h" + +namespace rocksdb { + + +EventLoggerStream::EventLoggerStream(Logger* logger) + : logger_(logger), log_buffer_(nullptr), json_writer_(nullptr) {} + +EventLoggerStream::EventLoggerStream(LogBuffer* log_buffer) + : logger_(nullptr), log_buffer_(log_buffer), json_writer_(nullptr) {} + +EventLoggerStream::~EventLoggerStream() { + if (json_writer_) { + json_writer_->EndObject(); +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", json_writer_->Get().c_str()); +#else + if (logger_) { + EventLogger::Log(logger_, *json_writer_); + } else if (log_buffer_) { + EventLogger::LogToBuffer(log_buffer_, *json_writer_); + } +#endif + delete json_writer_; + } +} + +void EventLogger::Log(const JSONWriter& jwriter) { + Log(logger_, jwriter); +} + +void EventLogger::Log(Logger* logger, const JSONWriter& jwriter) { +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", jwriter.Get().c_str()); +#else + rocksdb::Log(logger, "%s %s", Prefix(), jwriter.Get().c_str()); +#endif +} + +void EventLogger::LogToBuffer( + LogBuffer* log_buffer, const JSONWriter& jwriter) { +#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT + printf("%s\n", jwriter.Get().c_str()); +#else + assert(log_buffer); + rocksdb::LogToBuffer(log_buffer, "%s %s", Prefix(), jwriter.Get().c_str()); +#endif +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/event_logger.h b/src/rocksdb/util/event_logger.h new file mode 100644 index 00000000..d88a6a4f --- /dev/null +++ b/src/rocksdb/util/event_logger.h @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
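+
+// JSONWriter (below) builds a one-line JSON object incrementally: the first
+// string streamed into it becomes a key, the next token its value, and so
+// on. A minimal sketch with placeholder values:
+//
+//   JSONWriter w;
+//   w << "files";
+//   w.StartArray();
+//   w << 8 << 9;
+//   w.EndArray();
+//   w.EndObject();
+//   // w.Get() now returns {"files": [8, 9]}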
+ +#pragma once + +#include <memory> +#include <sstream> +#include <string> +#include <chrono> + +#include "rocksdb/env.h" +#include "util/log_buffer.h" + +namespace rocksdb { + +class JSONWriter { + public: + JSONWriter() : state_(kExpectKey), first_element_(true), in_array_(false) { + stream_ << "{"; + } + + void AddKey(const std::string& key) { + assert(state_ == kExpectKey); + if (!first_element_) { + stream_ << ", "; + } + stream_ << "\"" << key << "\": "; + state_ = kExpectValue; + first_element_ = false; + } + + void AddValue(const char* value) { + assert(state_ == kExpectValue || state_ == kInArray); + if (state_ == kInArray && !first_element_) { + stream_ << ", "; + } + stream_ << "\"" << value << "\""; + if (state_ != kInArray) { + state_ = kExpectKey; + } + first_element_ = false; + } + + template <typename T> + void AddValue(const T& value) { + assert(state_ == kExpectValue || state_ == kInArray); + if (state_ == kInArray && !first_element_) { + stream_ << ", "; + } + stream_ << value; + if (state_ != kInArray) { + state_ = kExpectKey; + } + first_element_ = false; + } + + void StartArray() { + assert(state_ == kExpectValue); + state_ = kInArray; + in_array_ = true; + stream_ << "["; + first_element_ = true; + } + + void EndArray() { + assert(state_ == kInArray); + state_ = kExpectKey; + in_array_ = false; + stream_ << "]"; + first_element_ = false; + } + + void StartObject() { + assert(state_ == kExpectValue); + state_ = kExpectKey; + stream_ << "{"; + first_element_ = true; + } + + void EndObject() { + assert(state_ == kExpectKey); + stream_ << "}"; + first_element_ = false; + } + + void StartArrayedObject() { + assert(state_ == kInArray && in_array_); + state_ = kExpectValue; + if (!first_element_) { + stream_ << ", "; + } + StartObject(); + } + + void EndArrayedObject() { + assert(in_array_); + EndObject(); + state_ = kInArray; + } + + std::string Get() const { return stream_.str(); } + + JSONWriter& operator<<(const char* val) { + if (state_ == kExpectKey) { + AddKey(val); + } else { + AddValue(val); + } + return *this; + } + + JSONWriter& operator<<(const std::string& val) { + return *this << val.c_str(); + } + + template <typename T> + JSONWriter& operator<<(const T& val) { + assert(state_ != kExpectKey); + AddValue(val); + return *this; + } + + private: + enum JSONWriterState { + kExpectKey, + kExpectValue, + kInArray, + kInArrayedObject, + }; + JSONWriterState state_; + bool first_element_; + bool in_array_; + std::ostringstream stream_; +}; + +class EventLoggerStream { + public: + template <typename T> + EventLoggerStream& operator<<(const T& val) { + MakeStream(); + *json_writer_ << val; + return *this; + } + + void StartArray() { json_writer_->StartArray(); } + void EndArray() { json_writer_->EndArray(); } + void StartObject() { json_writer_->StartObject(); } + void EndObject() { json_writer_->EndObject(); } + + ~EventLoggerStream(); + + private: + void MakeStream() { + if (!json_writer_) { + json_writer_ = new JSONWriter(); + *this << "time_micros" + << std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now().time_since_epoch()).count(); + } + } + friend class EventLogger; + explicit EventLoggerStream(Logger* logger); + explicit EventLoggerStream(LogBuffer* log_buffer); + // exactly one is non-nullptr + Logger* const logger_; + LogBuffer* const log_buffer_; + // ownership + JSONWriter* json_writer_; +}; + +// here is an example of the output that will show up in the LOG: +// 2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 
{"time_micros": +// 1421360005788015, "event": "table_file_creation", "file_number": 12, +// "file_size": 1909699} +class EventLogger { + public: + static const char* Prefix() { + return "EVENT_LOG_v1"; + } + + explicit EventLogger(Logger* logger) : logger_(logger) {} + EventLoggerStream Log() { return EventLoggerStream(logger_); } + EventLoggerStream LogToBuffer(LogBuffer* log_buffer) { + return EventLoggerStream(log_buffer); + } + void Log(const JSONWriter& jwriter); + static void Log(Logger* logger, const JSONWriter& jwriter); + static void LogToBuffer(LogBuffer* log_buffer, const JSONWriter& jwriter); + + private: + Logger* logger_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/event_logger_test.cc b/src/rocksdb/util/event_logger_test.cc new file mode 100644 index 00000000..4bcf30ff --- /dev/null +++ b/src/rocksdb/util/event_logger_test.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <string> + +#include "util/event_logger.h" +#include "util/testharness.h" + +namespace rocksdb { + +class EventLoggerTest : public testing::Test {}; + +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + vsnprintf(buffer_, sizeof(buffer_), format, ap); + } + char* buffer() { return buffer_; } + + private: + char buffer_[1000]; +}; + +TEST_F(EventLoggerTest, SimpleTest) { + StringLogger logger; + EventLogger event_logger(&logger); + event_logger.Log() << "id" << 5 << "event" + << "just_testing"; + std::string output(logger.buffer()); + ASSERT_TRUE(output.find("\"event\": \"just_testing\"") != std::string::npos); + ASSERT_TRUE(output.find("\"id\": 5") != std::string::npos); + ASSERT_TRUE(output.find("\"time_micros\"") != std::string::npos); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/fault_injection_test_env.cc b/src/rocksdb/util/fault_injection_test_env.cc new file mode 100644 index 00000000..9cad2387 --- /dev/null +++ b/src/rocksdb/util/fault_injection_test_env.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". + +#include "util/fault_injection_test_env.h" +#include <functional> +#include <utility> + +namespace rocksdb { + +// Assume a filename, and not a directory name like "/foo/bar/" +std::string GetDirName(const std::string filename) { + size_t found = filename.find_last_of("/\\"); + if (found == std::string::npos) { + return ""; + } else { + return filename.substr(0, found); + } +} + +// A basic file truncation function suitable for this test. 
+Status Truncate(Env* env, const std::string& filename, uint64_t length) {
+  std::unique_ptr<SequentialFile> orig_file;
+  const EnvOptions options;
+  Status s = env->NewSequentialFile(filename, &orig_file, options);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot open file %s for truncation: %s\n",
+            filename.c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  std::unique_ptr<char[]> scratch(new char[length]);
+  rocksdb::Slice result;
+  s = orig_file->Read(length, &result, scratch.get());
+#ifdef OS_WIN
+  orig_file.reset();
+#endif
+  if (s.ok()) {
+    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+    std::unique_ptr<WritableFile> tmp_file;
+    s = env->NewWritableFile(tmp_name, &tmp_file, options);
+    if (s.ok()) {
+      s = tmp_file->Append(result);
+      if (s.ok()) {
+        s = env->RenameFile(tmp_name, filename);
+      } else {
+        fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(),
+                filename.c_str(), s.ToString().c_str());
+        env->DeleteFile(tmp_name);
+      }
+    }
+  }
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+  }
+
+  return s;
+}
+
+// Trim the trailing "/" at the end of `str`
+std::string TrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+std::pair<std::string, std::string> GetDirAndName(const std::string& name) {
+  std::string dirname = GetDirName(name);
+  std::string fname = name.substr(dirname.size() + 1);
+  return std::make_pair(dirname, fname);
+}
+
+Status FileState::DropUnsyncedData(Env* env) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  return Truncate(env, filename_, sync_pos);
+}
+
+Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  assert(pos_ >= sync_pos);
+  int range = static_cast<int>(pos_ - sync_pos);
+  uint64_t truncated_size =
+      static_cast<uint64_t>(sync_pos) + rand->Uniform(range);
+  return Truncate(env, filename_, truncated_size);
+}
+
+Status TestDirectory::Fsync() {
+  env_->SyncDir(dirname_);
+  return dir_->Fsync();
+}
+
+TestWritableFile::TestWritableFile(const std::string& fname,
+                                   std::unique_ptr<WritableFile>&& f,
+                                   FaultInjectionTestEnv* env)
+    : state_(fname),
+      target_(std::move(f)),
+      writable_file_opened_(true),
+      env_(env) {
+  assert(target_ != nullptr);
+  state_.pos_ = 0;
+}
+
+TestWritableFile::~TestWritableFile() {
+  if (writable_file_opened_) {
+    Close();
+  }
+}
+
+Status TestWritableFile::Append(const Slice& data) {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  Status s = target_->Append(data);
+  if (s.ok()) {
+    state_.pos_ += data.size();
+    env_->WritableFileAppended(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Close() {
+  writable_file_opened_ = false;
+  Status s = target_->Close();
+  if (s.ok()) {
+    env_->WritableFileClosed(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Flush() {
+  Status s = target_->Flush();
+  if (s.ok() && env_->IsFilesystemActive()) {
+    state_.pos_at_last_flush_ = state_.pos_;
+  }
+  return s;
+}
+
+Status TestWritableFile::Sync() {
+  if (!env_->IsFilesystemActive()) {
+    return Status::IOError("FaultInjectionTestEnv: not active");
+  }
+  // No need to actually sync.
+  state_.pos_at_last_sync_ = state_.pos_;
+  env_->WritableFileSynced(state_);
+  return Status::OK();
+}
+
+Status FaultInjectionTestEnv::NewDirectory(const std::string& name,
+                                           std::unique_ptr<Directory>* result) {
+  std::unique_ptr<Directory> r;
+  Status s = target()->NewDirectory(name, &r);
+  assert(s.ok());
+  if (!s.ok()) {
+    return s;
+  }
+  result->reset(new TestDirectory(this, TrimDirname(name), r.release()));
+  return Status::OK();
+}
+
+Status FaultInjectionTestEnv::NewWritableFile(
+    const std::string& fname, std::unique_ptr<WritableFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  // Do not allow overwriting files
+  Status s = target()->FileExists(fname);
+  if (s.ok()) {
+    return Status::Corruption("File already exists.");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+  s = target()->NewWritableFile(fname, result, soptions);
+  if (s.ok()) {
+    result->reset(new TestWritableFile(fname, std::move(*result), this));
+    // If the file is opened for writing again, it will be truncated, so
+    // forget our saved state.
+    UntrackFile(fname);
+    MutexLock l(&mutex_);
+    open_files_.insert(fname);
+    auto dir_and_name = GetDirAndName(fname);
+    auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+    list.insert(dir_and_name.second);
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::ReopenWritableFile(
+    const std::string& fname, std::unique_ptr<WritableFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  Status s = target()->ReopenWritableFile(fname, result, soptions);
+  if (s.ok()) {
+    result->reset(new TestWritableFile(fname, std::move(*result), this));
+    // If the file is opened for writing again, it will be truncated, so
+    // forget our saved state.
+ UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + +Status FaultInjectionTestEnv::NewRandomAccessFile( + const std::string& fname, std::unique_ptr<RandomAccessFile>* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + return target()->NewRandomAccessFile(fname, result, soptions); +} + +Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = EnvWrapper::DeleteFile(f); + if (!s.ok()) { + fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), + s.ToString().c_str()); + } + assert(s.ok()); + if (s.ok()) { + UntrackFile(f); + } + return s; +} + +Status FaultInjectionTestEnv::RenameFile(const std::string& s, + const std::string& t) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status ret = EnvWrapper::RenameFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; +} + +void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { + MutexLock l(&mutex_); + if (open_files_.find(state.filename_) != open_files_.end()) { + db_file_state_[state.filename_] = state; + open_files_.erase(state.filename_); + } +} + +void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { + MutexLock l(&mutex_); + if (open_files_.find(state.filename_) != open_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) { + MutexLock l(&mutex_); + if (open_files_.find(state.filename_) != open_files_.end()) { + if (db_file_state_.find(state.filename_) == db_file_state_.end()) { + db_file_state_.insert(std::make_pair(state.filename_, state)); + } else { + db_file_state_[state.filename_] = state; + } + } +} + +// For every file that is not fully synced, make a call to `func` with +// FileState of the file as the parameter. 
+Status FaultInjectionTestEnv::DropFileData(
+    std::function<Status(Env*, FileState)> func) {
+  Status s;
+  MutexLock l(&mutex_);
+  for (std::map<std::string, FileState>::const_iterator it =
+           db_file_state_.begin();
+       s.ok() && it != db_file_state_.end(); ++it) {
+    const FileState& state = it->second;
+    if (!state.IsFullySynced()) {
+      s = func(target(), state);
+    }
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::DropUnsyncedFileData() {
+  return DropFileData([&](Env* env, const FileState& state) {
+    return state.DropUnsyncedData(env);
+  });
+}
+
+Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) {
+  return DropFileData([&](Env* env, const FileState& state) {
+    return state.DropRandomUnsyncedData(env, rnd);
+  });
+}
+
+Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
+  // Because DeleteFile accesses this container, make a copy to avoid deadlock
+  std::map<std::string, std::set<std::string>> map_copy;
+  {
+    MutexLock l(&mutex_);
+    map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+                    dir_to_new_files_since_last_sync_.end());
+  }
+
+  for (auto& pair : map_copy) {
+    for (std::string name : pair.second) {
+      Status s = DeleteFile(pair.first + "/" + name);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void FaultInjectionTestEnv::ResetState() {
+  MutexLock l(&mutex_);
+  db_file_state_.clear();
+  dir_to_new_files_since_last_sync_.clear();
+  SetFilesystemActiveNoLock(true);
+}
+
+void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
+  MutexLock l(&mutex_);
+  auto dir_and_name = GetDirAndName(f);
+  dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+      dir_and_name.second);
+  db_file_state_.erase(f);
+  open_files_.erase(f);
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/fault_injection_test_env.h b/src/rocksdb/util/fault_injection_test_env.h
new file mode 100644
index 00000000..7c5a080f
--- /dev/null
+++ b/src/rocksdb/util/fault_injection_test_env.h
@@ -0,0 +1,194 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
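+//
+// A typical sequence, sketched here (the write and verification steps are
+// placeholders):
+//
+//   FaultInjectionTestEnv env(Env::Default());
+//   // ... route file writes through `env`, sync some of them ...
+//   env.SetFilesystemActive(false);  // simulate a crash
+//   env.DropUnsyncedFileData();      // lose everything not covered by a sync
+//   env.ResetState();                // "reboot": filesystem is active again
+//   // ... reopen and verify that only synced data survived ...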
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/filename.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+class TestWritableFile;
+class FaultInjectionTestEnv;
+
+struct FileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+
+  explicit FileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) {}
+
+  FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  Status DropUnsyncedData(Env* env) const;
+
+  Status DropRandomUnsyncedData(Env* env, Random* rand) const;
+};
+
+// A wrapper around WritableFile that notifies FaultInjectionTestEnv whenever
+// the file is written to, sync'ed, or closed.
+class TestWritableFile : public WritableFile {
+ public:
+  explicit TestWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>&& f,
+                            FaultInjectionTestEnv* env);
+  virtual ~TestWritableFile();
+  virtual Status Append(const Slice& data) override;
+  virtual Status Truncate(uint64_t size) override {
+    return target_->Truncate(size);
+  }
+  virtual Status Close() override;
+  virtual Status Flush() override;
+  virtual Status Sync() override;
+  virtual bool IsSyncThreadSafe() const override { return true; }
+  virtual Status PositionedAppend(const Slice& data,
+                                  uint64_t offset) override {
+    return target_->PositionedAppend(data, offset);
+  }
+  virtual bool use_direct_io() const override {
+    return target_->use_direct_io();
+  }
+
+ private:
+  FileState state_;
+  std::unique_ptr<WritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestEnv* env_;
+};
+
+class TestDirectory : public Directory {
+ public:
+  explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname,
+                         Directory* dir)
+      : env_(env), dirname_(dirname), dir_(dir) {}
+  ~TestDirectory() {}
+
+  virtual Status Fsync() override;
+
+ private:
+  FaultInjectionTestEnv* env_;
+  std::string dirname_;
+  std::unique_ptr<Directory> dir_;
+};
+
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+  explicit FaultInjectionTestEnv(Env* base)
+      : EnvWrapper(base), filesystem_active_(true) {}
+  virtual ~FaultInjectionTestEnv() {}
+
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override;
+
+  Status NewWritableFile(const std::string& fname,
+                         std::unique_ptr<WritableFile>* result,
+                         const EnvOptions& soptions) override;
+
+  Status ReopenWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>* result,
+                            const EnvOptions& soptions) override;
+
+  Status NewRandomAccessFile(const std::string& fname,
+                             std::unique_ptr<RandomAccessFile>* result,
+                             const EnvOptions& soptions) override;
+
+  virtual Status DeleteFile(const std::string& f) override;
+
+  virtual Status RenameFile(const std::string& s,
+                            const std::string& t) override;
+
+  virtual Status GetFreeSpace(const std::string& path,
+                              uint64_t* disk_free) override {
+    if (!IsFilesystemActive() && error_ == Status::NoSpace()) {
+      *disk_free = 0;
+      return Status::OK();
+    } else {
+      return target()->GetFreeSpace(path, disk_free);
+    }
+  }
+
+  void WritableFileClosed(const FileState& state);
+
+  void WritableFileSynced(const FileState& state);
+
+  void WritableFileAppended(const FileState& state);
+
+  // For every file that is not fully synced, make a call to `func` with
+ // FileState of the file as the parameter. + Status DropFileData(std::function<Status(Env*, FileState)> func); + + Status DropUnsyncedFileData(); + + Status DropRandomUnsyncedFileData(Random* rnd); + + Status DeleteFilesCreatedAfterLastDirSync(); + + void ResetState(); + + void UntrackFile(const std::string& f); + + void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); + dir_to_new_files_since_last_sync_.erase(dirname); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. + bool IsFilesystemActive() { + MutexLock l(&mutex_); + return filesystem_active_; + } + void SetFilesystemActiveNoLock(bool active, + Status error = Status::Corruption("Not active")) { + filesystem_active_ = active; + if (!active) { + error_ = error; + } + } + void SetFilesystemActive(bool active, + Status error = Status::Corruption("Not active")) { + MutexLock l(&mutex_); + SetFilesystemActiveNoLock(active, error); + } + void AssertNoOpenFile() { assert(open_files_.empty()); } + Status GetError() { return error_; } + + private: + port::Mutex mutex_; + std::map<std::string, FileState> db_file_state_; + std::set<std::string> open_files_; + std::unordered_map<std::string, std::set<std::string>> + dir_to_new_files_since_last_sync_; + bool filesystem_active_; // Record flushes, syncs, writes + Status error_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/file_reader_writer.cc b/src/rocksdb/util/file_reader_writer.cc new file mode 100644 index 00000000..9a818cb0 --- /dev/null +++ b/src/rocksdb/util/file_reader_writer.cc @@ -0,0 +1,867 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
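+//
+// The direct-I/O paths below align every read to the file's required buffer
+// alignment. A worked example of the arithmetic, assuming alignment = 4096
+// and a logical read at (offset = 5000, n = 100):
+//
+//   aligned_offset = TruncateToPageBoundary(4096, 5000);          // 4096
+//   offset_advance = 5000 - aligned_offset;                       // 904
+//   read_size      = Roundup(5000 + 100, 4096) - aligned_offset;  // 4096
+//
+// The read is issued at aligned_offset for read_size bytes, and the caller's
+// n bytes are copied out of the aligned buffer starting at offset_advance.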
+ +#include "util/file_reader_writer.h" + +#include <algorithm> +#include <mutex> + +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "util/random.h" +#include "util/rate_limiter.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef NDEBUG +namespace { +bool IsFileSectorAligned(const size_t off, size_t sector_size) { + return off % sector_size == 0; +} +} +#endif + +Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + Status s; + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t offset = offset_.fetch_add(n); + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = TruncateToPageBoundary(alignment, offset); + size_t offset_advance = offset - aligned_offset; + size_t size = Roundup(offset + n, alignment) - aligned_offset; + size_t r = 0; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(size); + Slice tmp; + s = file_->PositionedRead(aligned_offset, size, &tmp, buf.BufferStart()); + if (s.ok() && offset_advance < tmp.size()) { + buf.Size(tmp.size()); + r = buf.Read(scratch, offset_advance, + std::min(tmp.size() - offset_advance, n)); + } + *result = Slice(scratch, r); +#endif // !ROCKSDB_LITE + } else { + s = file_->Read(n, result, scratch); + } + IOSTATS_ADD(bytes_read, result->size()); + return s; +} + + +Status SequentialFileReader::Skip(uint64_t n) { +#ifndef ROCKSDB_LITE + if (use_direct_io()) { + offset_ += static_cast<size_t>(n); + return Status::OK(); + } +#endif // !ROCKSDB_LITE + return file_->Skip(n); +} + +Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + uint64_t elapsed = 0; + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = TruncateToPageBoundary(alignment, static_cast<size_t>(offset)); + size_t offset_advance = static_cast<size_t>(offset) - aligned_offset; + size_t read_size = Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(read_size); + while (buf.CurrentSize() < read_size) { + size_t allowed; + if (for_compaction_ && rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + buf.Capacity() - buf.CurrentSize(), buf.Alignment(), + Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead); + } else { + assert(buf.CurrentSize() == 0); + allowed = read_size; + } + Slice tmp; + + FileOperationInfo::TimePoint start_ts; + uint64_t orig_offset = 0; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + orig_offset = aligned_offset + buf.CurrentSize(); + } + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp, + buf.Destination()); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + s); + } + + buf.Size(buf.CurrentSize() + tmp.size()); + if (!s.ok() || tmp.size() < allowed) { + break; + } + } + size_t res_len = 0; + if (s.ok() && offset_advance < buf.CurrentSize()) { + res_len = buf.Read(scratch, offset_advance, + std::min(buf.CurrentSize() - offset_advance, n)); + } + *result = Slice(scratch, res_len); +#endif // !ROCKSDB_LITE + } else { + size_t pos = 0; + const char* res_scratch = nullptr; + while (pos < n) { + size_t allowed; + if (for_compaction_ && rate_limiter_ != nullptr) { + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStart(); + } + allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */, + Env::IOPriority::IO_LOW, stats_, + RateLimiter::OpType::kRead); + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStop(); + } + } else { + allowed = n; + } + Slice tmp_result; + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, + finish_ts, s); + } +#endif + + if (res_scratch == nullptr) { + // we can't simply use `scratch` because reads of mmap'd files return + // data in a different buffer. + res_scratch = tmp_result.data(); + } else { + // make sure chunks are inserted contiguously into `res_scratch`. + assert(tmp_result.data() == res_scratch + pos); + } + pos += tmp_result.size(); + if (!s.ok() || tmp_result.size() < allowed) { + break; + } + } + *result = Slice(res_scratch, s.ok() ? 
pos : 0);
+    }
+    IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return s;
+}
+
+Status WritableFileWriter::Append(const Slice& data) {
+  const char* src = data.data();
+  size_t left = data.size();
+  Status s;
+  pending_sync_ = true;
+
+  TEST_KILL_RANDOM("WritableFileWriter::Append:0",
+                   rocksdb_kill_odds * REDUCE_ODDS2);
+
+  {
+    IOSTATS_TIMER_GUARD(prepare_write_nanos);
+    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
+    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
+  }
+
+  // See whether we need to enlarge the buffer to avoid the flush
+  if (buf_.Capacity() - buf_.CurrentSize() < left) {
+    for (size_t cap = buf_.Capacity();
+         cap < max_buffer_size_;  // There is still room to increase
+         cap *= 2) {
+      // See whether the next available size is large enough.
+      // Buffer will never be increased to more than max_buffer_size_.
+      size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
+      if (desired_capacity - buf_.CurrentSize() >= left ||
+          (use_direct_io() && desired_capacity == max_buffer_size_)) {
+        buf_.AllocateNewBuffer(desired_capacity, true);
+        break;
+      }
+    }
+  }
+
+  // Flush only when using buffered I/O
+  if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
+    if (buf_.CurrentSize() > 0) {
+      s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    assert(buf_.CurrentSize() == 0);
+  }
+
+  // We never write directly to disk when direct I/O is on; otherwise the
+  // buffer simply serves its original purpose of accumulating many small
+  // chunks.
+  if (use_direct_io() || (buf_.Capacity() >= left)) {
+    while (left > 0) {
+      size_t appended = buf_.Append(src, left);
+      left -= appended;
+      src += appended;
+
+      if (left > 0) {
+        s = Flush();
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+  } else {
+    // Writing directly to the file, bypassing the buffer
+    assert(buf_.CurrentSize() == 0);
+    s = WriteBuffered(src, left);
+  }
+
+  TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
+  if (s.ok()) {
+    filesize_ += data.size();
+  }
+  return s;
+}
+
+Status WritableFileWriter::Pad(const size_t pad_bytes) {
+  assert(pad_bytes < kDefaultPageSize);
+  size_t left = pad_bytes;
+  size_t cap = buf_.Capacity() - buf_.CurrentSize();
+
+  // Assume pad_bytes is small compared to buf_ capacity. So we always
+  // use buf_ rather than writing directly to the file as Append() does in
+  // certain cases.
+  while (left) {
+    size_t append_bytes = std::min(cap, left);
+    buf_.PadWith(append_bytes, 0);
+    left -= append_bytes;
+    if (left > 0) {
+      Status s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    cap = buf_.Capacity() - buf_.CurrentSize();
+  }
+  pending_sync_ = true;
+  filesize_ += pad_bytes;
+  return Status::OK();
+}
+
+Status WritableFileWriter::Close() {
+  // Do not quit immediately on failure; the file MUST be closed.
+  Status s;
+
+  // It is now possible to close the file twice, since we MUST close it in
+  // the destructor and simply flushing is not enough: on Windows
+  // pre-allocation does not fill the file with zeros, and with unbuffered
+  // access we also need to set the end of data.
+  if (!writable_file_) {
+    return s;
+  }
+
+  s = Flush();  // flush cache to OS
+
+  Status interim;
+  // In direct I/O mode we write whole pages, so
+  // we need to let the file know where data ends.
+ if (use_direct_io()) { + interim = writable_file_->Truncate(filesize_); + if (interim.ok()) { + interim = writable_file_->Fsync(); + } + if (!interim.ok() && s.ok()) { + s = interim; + } + } + + TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); + interim = writable_file_->Close(); + if (!interim.ok() && s.ok()) { + s = interim; + } + + writable_file_.reset(); + TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + + return s; +} + +// write out the cached data to the OS cache or storage if direct I/O +// enabled +Status WritableFileWriter::Flush() { + Status s; + TEST_KILL_RANDOM("WritableFileWriter::Flush:0", + rocksdb_kill_odds * REDUCE_ODDS2); + + if (buf_.CurrentSize() > 0) { + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + if (pending_sync_) { + s = WriteDirect(); + } +#endif // !ROCKSDB_LITE + } else { + s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + } + if (!s.ok()) { + return s; + } + } + + s = writable_file_->Flush(); + + if (!s.ok()) { + return s; + } + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). + + // We try to avoid sync to the last 1MB of data. For two reasons: + // (1) avoid rewrite the same page that is modified later. + // (2) for older version of OS, write can block while writing out + // the page. + // Xfs does neighbor page flushing outside of the specified ranges. We + // need to make sure sync range is far from the write offset. + if (!use_direct_io() && bytes_per_sync_) { + const uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced. + const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB. 
+ if (filesize_ > kBytesNotSyncRange) { + uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange; + offset_sync_to -= offset_sync_to % kBytesAlignWhenSync; + assert(offset_sync_to >= last_sync_size_); + if (offset_sync_to > 0 && + offset_sync_to - last_sync_size_ >= bytes_per_sync_) { + s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + last_sync_size_ = offset_sync_to; + } + } + } + + return s; +} + +Status WritableFileWriter::Sync(bool use_fsync) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + if (!use_direct_io() && pending_sync_) { + s = SyncInternal(use_fsync); + if (!s.ok()) { + return s; + } + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + pending_sync_ = false; + return Status::OK(); +} + +Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) { + if (!writable_file_->IsSyncThreadSafe()) { + return Status::NotSupported( + "Can't WritableFileWriter::SyncWithoutFlush() because " + "WritableFile::IsSyncThreadSafe() is false"); + } + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); + Status s = SyncInternal(use_fsync); + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); + return s; +} + +Status WritableFileWriter::SyncInternal(bool use_fsync) { + Status s; + IOSTATS_TIMER_GUARD(fsync_nanos); + TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + if (use_fsync) { + s = writable_file_->Fsync(); + } else { + s = writable_file_->Sync(); + } + SetPerfLevel(prev_perf_level); + return s; +} + +Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { + IOSTATS_TIMER_GUARD(range_sync_nanos); + TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); + return writable_file_->RangeSync(offset, nbytes); +} + +// This method writes to disk the specified data and makes use of the rate +// limiter if available +Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { + Status s; + assert(!use_direct_io()); + const char* src = data; + size_t left = size; + + while (left > 0) { + size_t allowed; + if (rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_, + RateLimiter::OpType::kWrite); + } else { + allowed = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(); + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + old_size = next_write_offset_; + } +#endif + { + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + s = writable_file_->Append(Slice(src, allowed)); + SetPerfLevel(prev_perf_level); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + } +#endif + if (!s.ok()) { + return s; + } + } + + IOSTATS_ADD(bytes_written, allowed); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + + left -= allowed; + src += allowed; + } + buf_.Size(0); + return s; +} + + +// This flushes the accumulated data in the buffer. We pad data with zeros if +// necessary to the whole page. +// However, during automatic flushes padding would not be necessary. 
+// We always use RateLimiter if available. We move (Refit) any buffer bytes +// that are left over the +// whole number of pages to be written again on the next flush because we can +// only write on aligned +// offsets. +#ifndef ROCKSDB_LITE +Status WritableFileWriter::WriteDirect() { + assert(use_direct_io()); + Status s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write + // it again in the future either on Close() OR when the current whole page + // fills out + size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up and pad + buf_.PadToAlignmentWith(0); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + + while (left > 0) { + // Check how much is allowed + size_t size; + if (rate_limiter_ != nullptr) { + size = rate_limiter_->RequestToken(left, buf_.Alignment(), + writable_file_->GetIOPriority(), + stats_, RateLimiter::OpType::kWrite); + } else { + size = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } + // direct writes must be positional + s = writable_file_->PositionedAppend(Slice(src, size), write_offset); + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + } + if (!s.ok()) { + buf_.Size(file_advance + leftover_tail); + return s; + } + } + + IOSTATS_ADD(bytes_written, size); + left -= size; + src += size; + write_offset += size; + assert((next_write_offset_ % alignment) == 0); + } + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close() + buf_.RefitTail(file_advance, leftover_tail); + // This is where we start writing next time which may or not be + // the actual file size on disk. 
They match if the buffer size + // is a multiple of whole pages otherwise filesize_ is leftover_tail + // behind + next_write_offset_ += file_advance; + } + return s; +} +#endif // !ROCKSDB_LITE + +namespace { +class ReadaheadRandomAccessFile : public RandomAccessFile { + public: + ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile>&& file, + size_t readahead_size) + : file_(std::move(file)), + alignment_(file_->GetRequiredBufferAlignment()), + readahead_size_(Roundup(readahead_size, alignment_)), + buffer_(), + buffer_offset_(0) { + buffer_.Alignment(alignment_); + buffer_.AllocateNewBuffer(readahead_size_); + } + + ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete; + + ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete; + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + if (n + alignment_ >= readahead_size_) { + return file_->Read(offset, n, result, scratch); + } + + std::unique_lock<std::mutex> lk(lock_); + + size_t cached_len = 0; + // Check if there is a cache hit, means that [offset, offset + n) is either + // completely or partially in the buffer + // If it's completely cached, including end of file case when offset + n is + // greater than EOF, return + if (TryReadFromCache(offset, n, &cached_len, scratch) && + (cached_len == n || + // End of file + buffer_.CurrentSize() < readahead_size_)) { + *result = Slice(scratch, cached_len); + return Status::OK(); + } + size_t advanced_offset = static_cast<size_t>(offset + cached_len); + // In the case of cache hit advanced_offset is already aligned, means that + // chunk_offset equals to advanced_offset + size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); + Slice readahead_result; + + Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + if (s.ok()) { + // In the case of cache miss, i.e. when cached_len equals 0, an offset can + // exceed the file end position, so the following check is required + if (advanced_offset < chunk_offset + buffer_.CurrentSize()) { + // In the case of cache miss, the first chunk_padding bytes in buffer_ + // are + // stored for alignment only and must be skipped + size_t chunk_padding = advanced_offset - chunk_offset; + auto remaining_len = + std::min(buffer_.CurrentSize() - chunk_padding, n - cached_len); + memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding, + remaining_len); + *result = Slice(scratch, cached_len + remaining_len); + } else { + *result = Slice(scratch, cached_len); + } + } + return s; + } + + Status Prefetch(uint64_t offset, size_t n) override { + if (n < readahead_size_) { + // Don't allow smaller prefetches than the configured `readahead_size_`. + // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. 
+ return Status::OK(); + } + size_t offset_ = static_cast<size_t>(offset); + size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); + if (prefetch_offset == buffer_offset_) { + return Status::OK(); + } + return ReadIntoBuffer(prefetch_offset, + Roundup(offset_ + n, alignment_) - prefetch_offset); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return file_->GetUniqueId(id, max_size); + } + + void Hint(AccessPattern pattern) override { file_->Hint(pattern); } + + Status InvalidateCache(size_t offset, size_t length) override { + return file_->InvalidateCache(offset, length); + } + + bool use_direct_io() const override { return file_->use_direct_io(); } + +private: + bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len, + char* scratch) const { + if (offset < buffer_offset_ || + offset >= buffer_offset_ + buffer_.CurrentSize()) { + *cached_len = 0; + return false; + } + uint64_t offset_in_buffer = offset - buffer_offset_; + *cached_len = std::min( + buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n); + memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len); + return true; + } + + Status ReadIntoBuffer(uint64_t offset, size_t n) const { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + assert(IsFileSectorAligned(offset, alignment_)); + assert(IsFileSectorAligned(n, alignment_)); + Slice result; + Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = offset; + buffer_.Size(result.size()); + assert(buffer_.BufferStart() == result.data()); + } + return s; + } + + std::unique_ptr<RandomAccessFile> file_; + const size_t alignment_; + size_t readahead_size_; + + mutable std::mutex lock_; + mutable AlignedBuffer buffer_; + mutable uint64_t buffer_offset_; +}; +} // namespace + +Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, + uint64_t offset, size_t n) { + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + size_t offset_ = static_cast<size_t>(offset); + uint64_t rounddown_offset = Rounddown(offset_, alignment); + uint64_t roundup_end = Roundup(offset_ + n, alignment); + uint64_t roundup_len = roundup_end - rounddown_offset; + assert(roundup_len >= alignment); + assert(roundup_len % alignment == 0); + + // Check if requested bytes are in the existing buffer_. + // If all bytes exist -- return. + // If only a few bytes exist -- reuse them & read only what is really needed. + // This is typically the case of incremental reading of data. + // If no bytes exist in buffer -- full pread. + + Status s; + uint64_t chunk_offset_in_buffer = 0; + uint64_t chunk_len = 0; + bool copy_data_to_new_buffer = false; + if (buffer_.CurrentSize() > 0 && offset >= buffer_offset_ && + offset <= buffer_offset_ + buffer_.CurrentSize()) { + if (offset + n <= buffer_offset_ + buffer_.CurrentSize()) { + // All requested bytes are already in the buffer. So no need to Read + // again. + return s; + } else { + // Only a few requested bytes are in the buffer. memmove those chunk of + // bytes to the beginning, and memcpy them back into the new buffer if a + // new buffer is created. 
+ chunk_offset_in_buffer = Rounddown(static_cast<size_t>(offset - buffer_offset_), alignment); + chunk_len = buffer_.CurrentSize() - chunk_offset_in_buffer; + assert(chunk_offset_in_buffer % alignment == 0); + assert(chunk_len % alignment == 0); + assert(chunk_offset_in_buffer + chunk_len <= + buffer_offset_ + buffer_.CurrentSize()); + if (chunk_len > 0) { + copy_data_to_new_buffer = true; + } else { + // this reset is not necessary, but just to be safe. + chunk_offset_in_buffer = 0; + } + } + } + + // Create a new buffer only if current capacity is not sufficient, and memcopy + // bytes from old buffer if needed (i.e., if chunk_len is greater than 0). + if (buffer_.Capacity() < roundup_len) { + buffer_.Alignment(alignment); + buffer_.AllocateNewBuffer(static_cast<size_t>(roundup_len), + copy_data_to_new_buffer, chunk_offset_in_buffer, + static_cast<size_t>(chunk_len)); + } else if (chunk_len > 0) { + // New buffer not needed. But memmove bytes from tail to the beginning since + // chunk_len is greater than 0. + buffer_.RefitTail(static_cast<size_t>(chunk_offset_in_buffer), static_cast<size_t>(chunk_len)); + } + + Slice result; + s = reader->Read(rounddown_offset + chunk_len, + static_cast<size_t>(roundup_len - chunk_len), &result, + buffer_.BufferStart() + chunk_len); + if (s.ok()) { + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast<size_t>(chunk_len) + result.size()); + } + return s; +} + +bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, + Slice* result) { + if (track_min_offset_ && offset < min_offset_read_) { + min_offset_read_ = static_cast<size_t>(offset); + } + if (!enable_ || offset < buffer_offset_) { + return false; + } + + // If the buffer contains only a few of the requested bytes: + // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // and satisfy the request. + // If readahead is not enabled: return false. + if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { + if (readahead_size_ > 0) { + assert(file_reader_ != nullptr); + assert(max_readahead_size_ >= readahead_size_); + + Status s = Prefetch(file_reader_, offset, n + readahead_size_); + if (!s.ok()) { + return false; + } + readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); + } else { + return false; + } + } + + uint64_t offset_in_buffer = offset - buffer_offset_; + *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); + return true; +} + +std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile( + std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size) { + std::unique_ptr<RandomAccessFile> result( + new ReadaheadRandomAccessFile(std::move(file), readahead_size)); + return result; +} + +Status NewWritableFile(Env* env, const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) { + Status s = env->NewWritableFile(fname, result, options); + TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + return s; +} + +bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, + std::string* output, bool* has_data, Status* result) { + const int kBufferSize = 8192; + char buffer[kBufferSize + 1]; + Slice input_slice; + + std::string line; + bool has_complete_line = false; + while (!has_complete_line) { + if (std::getline(*iss, line)) { + has_complete_line = !iss->eof(); + } else { + has_complete_line = false; + } + if (!has_complete_line) { + // if we're not sure whether we have a complete line, + // further read from the file. 
+ if (*has_data) { + *result = seq_file->Read(kBufferSize, &input_slice, buffer); + } + if (input_slice.size() == 0) { + // meaning we have read all the data + *has_data = false; + break; + } else { + iss->str(line + input_slice.ToString()); + // reset the internal state of iss so that we can keep reading it. + iss->clear(); + *has_data = (input_slice.size() == kBufferSize); + continue; + } + } + } + *output = line; + return *has_data || has_complete_line; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/file_reader_writer.h b/src/rocksdb/util/file_reader_writer.h new file mode 100644 index 00000000..4451f8b8 --- /dev/null +++ b/src/rocksdb/util/file_reader_writer.h @@ -0,0 +1,326 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include <atomic> +#include <sstream> +#include <string> +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" +#include "util/aligned_buffer.h" +#include "util/sync_point.h" + +namespace rocksdb { + +class Statistics; +class HistogramImpl; + +std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile( + std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size); + +class SequentialFileReader { + private: + std::unique_ptr<SequentialFile> file_; + std::string file_name_; + std::atomic<size_t> offset_; // read offset + + public: + explicit SequentialFileReader(std::unique_ptr<SequentialFile>&& _file, + const std::string& _file_name) + : file_(std::move(_file)), file_name_(_file_name), offset_(0) {} + + SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + return *this; + } + + SequentialFileReader(const SequentialFileReader&) = delete; + SequentialFileReader& operator=(const SequentialFileReader&) = delete; + + Status Read(size_t n, Slice* result, char* scratch); + + Status Skip(uint64_t n); + + void Rewind(); + + SequentialFile* file() { return file_.get(); } + + std::string file_name() { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } +}; + +class RandomAccessFileReader { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileReadFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr<RandomAccessFile> file_; + std::string file_name_; + Env* env_; + Statistics* stats_; + uint32_t hist_type_; + HistogramImpl* file_read_hist_; + RateLimiter* rate_limiter_; + bool for_compaction_; + std::vector<std::shared_ptr<EventListener>> listeners_; + + public: + explicit RandomAccessFileReader( + std::unique_ptr<RandomAccessFile>&& 
raf, std::string _file_name, + Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, + HistogramImpl* file_read_hist = nullptr, + RateLimiter* rate_limiter = nullptr, bool for_compaction = false, + const std::vector<std::shared_ptr<EventListener>>& listeners = {}) + : file_(std::move(raf)), + file_name_(std::move(_file_name)), + env_(env), + stats_(stats), + hist_type_(hist_type), + file_read_hist_(file_read_hist), + rate_limiter_(rate_limiter), + for_compaction_(for_compaction), + listeners_() { +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr<EventListener>& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + RandomAccessFileReader& operator=(RandomAccessFileReader&& o) + ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + env_ = std::move(o.env_); + stats_ = std::move(o.stats_); + hist_type_ = std::move(o.hist_type_); + file_read_hist_ = std::move(o.file_read_hist_); + rate_limiter_ = std::move(o.rate_limiter_); + for_compaction_ = std::move(o.for_compaction_); + return *this; + } + + RandomAccessFileReader(const RandomAccessFileReader&) = delete; + RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; + + Status Prefetch(uint64_t offset, size_t n) const { + return file_->Prefetch(offset, n); + } + + RandomAccessFile* file() { return file_.get(); } + + std::string file_name() const { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } +}; + +// Use posix write to write data to a file. 
+class WritableFileWriter { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileWriteFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileWriteFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr<WritableFile> writable_file_; + std::string file_name_; + Env* env_; + AlignedBuffer buf_; + size_t max_buffer_size_; + // Actually written data size can be used for truncate + // not counting padding data + uint64_t filesize_; +#ifndef ROCKSDB_LITE + // This is necessary when we use unbuffered access + // and writes must happen on aligned offsets + // so we need to go back and write that page again + uint64_t next_write_offset_; +#endif // ROCKSDB_LITE + bool pending_sync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; + RateLimiter* rate_limiter_; + Statistics* stats_; + std::vector<std::shared_ptr<EventListener>> listeners_; + + public: + WritableFileWriter( + std::unique_ptr<WritableFile>&& file, const std::string& _file_name, + const EnvOptions& options, Env* env = nullptr, + Statistics* stats = nullptr, + const std::vector<std::shared_ptr<EventListener>>& listeners = {}) + : writable_file_(std::move(file)), + file_name_(_file_name), + env_(env), + buf_(), + max_buffer_size_(options.writable_file_max_buffer_size), + filesize_(0), +#ifndef ROCKSDB_LITE + next_write_offset_(0), +#endif // ROCKSDB_LITE + pending_sync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + rate_limiter_(options.rate_limiter), + stats_(stats), + listeners_() { + TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", + reinterpret_cast<void*>(max_buffer_size_)); + buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); + buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_)); +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr<EventListener>& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + WritableFileWriter(const WritableFileWriter&) = delete; + + WritableFileWriter& operator=(const WritableFileWriter&) = delete; + + ~WritableFileWriter() { Close(); } + + std::string file_name() const { return file_name_; } + + Status Append(const Slice& data); + + Status Pad(const size_t pad_bytes); + + Status Flush(); + + Status Close(); + + Status Sync(bool use_fsync); + + // Sync only the data that was already Flush()ed. Safe to call concurrently + // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), + // returns NotSupported status. 
+  Status SyncWithoutFlush(bool use_fsync);
+
+  uint64_t GetFileSize() { return filesize_; }
+
+  Status InvalidateCache(size_t offset, size_t length) {
+    return writable_file_->InvalidateCache(offset, length);
+  }
+
+  WritableFile* writable_file() const { return writable_file_.get(); }
+
+  bool use_direct_io() { return writable_file_->use_direct_io(); }
+
+  bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; }
+
+ private:
+  // Used when OS buffering is off and writes go through DMA,
+  // such as in Direct I/O mode
+#ifndef ROCKSDB_LITE
+  Status WriteDirect();
+#endif  // !ROCKSDB_LITE
+  // Normal write
+  Status WriteBuffered(const char* data, size_t size);
+  Status RangeSync(uint64_t offset, uint64_t nbytes);
+  Status SyncInternal(bool use_fsync);
+};
+
+// FilePrefetchBuffer can automatically do the readahead if file_reader,
+// readahead_size, and max_readahead_size are passed in.
+// max_readahead_size should be greater than or equal to readahead_size.
+// readahead_size will be doubled on every IO, until max_readahead_size.
+class FilePrefetchBuffer {
+ public:
+  // If `track_min_offset` is true, track minimum offset ever read.
+  FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr,
+                     size_t readahead_size = 0, size_t max_readahead_size = 0,
+                     bool enable = true, bool track_min_offset = false)
+      : buffer_offset_(0),
+        file_reader_(file_reader),
+        readahead_size_(readahead_size),
+        max_readahead_size_(max_readahead_size),
+        min_offset_read_(port::kMaxSizet),
+        enable_(enable),
+        track_min_offset_(track_min_offset) {}
+  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n);
+  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result);
+
+  // The minimum `offset` ever passed to TryReadFromCache(). Only tracked
+  // if track_min_offset = true.
+  size_t min_offset_read() const { return min_offset_read_; }
+
+ private:
+  AlignedBuffer buffer_;
+  uint64_t buffer_offset_;
+  RandomAccessFileReader* file_reader_;
+  size_t readahead_size_;
+  size_t max_readahead_size_;
+  // The minimum `offset` ever passed to TryReadFromCache().
+  size_t min_offset_read_;
+  // If false, TryReadFromCache() always returns false, and we only track the
+  // minimum offset if track_min_offset_ = true.
+  bool enable_;
+  // If true, track minimum `offset` ever passed to TryReadFromCache(), which
+  // can be fetched from min_offset_read().
+  bool track_min_offset_;
+};
+
+extern Status NewWritableFile(Env* env, const std::string& fname,
+                              std::unique_ptr<WritableFile>* result,
+                              const EnvOptions& options);
+bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file,
+                 std::string* output, bool* has_data, Status* result);
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/file_reader_writer_test.cc b/src/rocksdb/util/file_reader_writer_test.cc
new file mode 100644
index 00000000..6a7ea6d7
--- /dev/null
+++ b/src/rocksdb/util/file_reader_writer_test.cc
@@ -0,0 +1,330 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// +#include "util/file_reader_writer.h" +#include <algorithm> +#include <vector> +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class WritableFileWriterTest : public testing::Test {}; + +const uint32_t kMb = 1 << 20; + +TEST_F(WritableFileWriterTest, RangeSync) { + class FakeWF : public WritableFile { + public: + explicit FakeWF() : size_(0), last_synced_(0) {} + ~FakeWF() override {} + + Status Append(const Slice& data) override { + size_ += data.size(); + return Status::OK(); + } + Status Truncate(uint64_t /*size*/) override { return Status::OK(); } + Status Close() override { + EXPECT_GE(size_, last_synced_ + kMb); + EXPECT_LT(size_, last_synced_ + 2 * kMb); + // Make sure random writes generated enough writes. + EXPECT_GT(size_, 10 * kMb); + return Status::OK(); + } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Fsync() override { return Status::OK(); } + void SetIOPriority(Env::IOPriority /*pri*/) override {} + uint64_t GetFileSize() override { return size_; } + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + return Status::OK(); + } + + protected: + Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { + return Status::OK(); + } + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + EXPECT_EQ(offset % 4096, 0u); + EXPECT_EQ(nbytes % 4096, 0u); + + EXPECT_EQ(offset, last_synced_); + last_synced_ = offset + nbytes; + EXPECT_GE(size_, last_synced_ + kMb); + if (size_ > 2 * kMb) { + EXPECT_LT(size_, last_synced_ + 2 * kMb); + } + return Status::OK(); + } + + uint64_t size_; + uint64_t last_synced_; + }; + + EnvOptions env_options; + env_options.bytes_per_sync = kMb; + std::unique_ptr<FakeWF> wf(new FakeWF); + std::unique_ptr<WritableFileWriter> writer( + new WritableFileWriter(std::move(wf), "" /* don't care */, env_options)); + Random r(301); + std::unique_ptr<char[]> large_buf(new char[10 * kMb]); + for (int i = 0; i < 1000; i++) { + int skew_limit = (i < 700) ? 10 : 15; + uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); + writer->Append(Slice(large_buf.get(), num)); + + // Flush in a chance of 1/10. 
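+    // (Editorial note: r.Uniform(10) yields a value in [0, 10), so the
+    // branch below is taken with probability ~1/10.)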
+ if (r.Uniform(10) == 0) { + writer->Flush(); + } + } + writer->Close(); +} + +TEST_F(WritableFileWriterTest, IncrementalBuffer) { + class FakeWF : public WritableFile { + public: + explicit FakeWF(std::string* _file_data, bool _use_direct_io, + bool _no_flush) + : file_data_(_file_data), + use_direct_io_(_use_direct_io), + no_flush_(_no_flush) {} + ~FakeWF() override {} + + Status Append(const Slice& data) override { + file_data_->append(data.data(), data.size()); + size_ += data.size(); + return Status::OK(); + } + Status PositionedAppend(const Slice& data, uint64_t pos) override { + EXPECT_TRUE(pos % 512 == 0); + EXPECT_TRUE(data.size() % 512 == 0); + file_data_->resize(pos); + file_data_->append(data.data(), data.size()); + size_ += data.size(); + return Status::OK(); + } + + Status Truncate(uint64_t size) override { + file_data_->resize(size); + return Status::OK(); + } + Status Close() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Fsync() override { return Status::OK(); } + void SetIOPriority(Env::IOPriority /*pri*/) override {} + uint64_t GetFileSize() override { return size_; } + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + return Status::OK(); + } + bool use_direct_io() const override { return use_direct_io_; } + + std::string* file_data_; + bool use_direct_io_; + bool no_flush_; + size_t size_ = 0; + }; + + Random r(301); + const int kNumAttempts = 50; + for (int attempt = 0; attempt < kNumAttempts; attempt++) { + bool no_flush = (attempt % 3 == 0); + EnvOptions env_options; + env_options.writable_file_max_buffer_size = + (attempt < kNumAttempts / 2) ? 512 * 1024 : 700 * 1024; + std::string actual; + std::unique_ptr<FakeWF> wf(new FakeWF(&actual, +#ifndef ROCKSDB_LITE + attempt % 2 == 1, +#else + false, +#endif + no_flush)); + std::unique_ptr<WritableFileWriter> writer(new WritableFileWriter( + std::move(wf), "" /* don't care */, env_options)); + + std::string target; + for (int i = 0; i < 20; i++) { + uint32_t num = r.Skewed(16) * 100 + r.Uniform(100); + std::string random_string; + test::RandomString(&r, num, &random_string); + writer->Append(Slice(random_string.c_str(), num)); + target.append(random_string.c_str(), num); + + // In some attempts, flush in a chance of 1/10. 
+ if (!no_flush && r.Uniform(10) == 0) { + writer->Flush(); + } + } + writer->Flush(); + writer->Close(); + ASSERT_EQ(target.size(), actual.size()); + ASSERT_EQ(target, actual); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(WritableFileWriterTest, AppendStatusReturn) { + class FakeWF : public WritableFile { + public: + explicit FakeWF() : use_direct_io_(false), io_error_(false) {} + + bool use_direct_io() const override { return use_direct_io_; } + Status Append(const Slice& /*data*/) override { + if (io_error_) { + return Status::IOError("Fake IO error"); + } + return Status::OK(); + } + Status PositionedAppend(const Slice& /*data*/, uint64_t) override { + if (io_error_) { + return Status::IOError("Fake IO error"); + } + return Status::OK(); + } + Status Close() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + void Setuse_direct_io(bool val) { use_direct_io_ = val; } + void SetIOError(bool val) { io_error_ = val; } + + protected: + bool use_direct_io_; + bool io_error_; + }; + std::unique_ptr<FakeWF> wf(new FakeWF()); + wf->Setuse_direct_io(true); + std::unique_ptr<WritableFileWriter> writer( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); + + ASSERT_OK(writer->Append(std::string(2 * kMb, 'a'))); + + // Next call to WritableFile::Append() should fail + dynamic_cast<FakeWF*>(writer->writable_file())->SetIOError(true); + ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b'))); +} +#endif + +class ReadaheadRandomAccessFileTest + : public testing::Test, + public testing::WithParamInterface<size_t> { + public: + static std::vector<size_t> GetReadaheadSizeList() { + return {1lu << 12, 1lu << 16}; + } + void SetUp() override { + readahead_size_ = GetParam(); + scratch_.reset(new char[2 * readahead_size_]); + ResetSourceStr(); + } + ReadaheadRandomAccessFileTest() : control_contents_() {} + std::string Read(uint64_t offset, size_t n) { + Slice result; + test_read_holder_->Read(offset, n, &result, scratch_.get()); + return std::string(result.data(), result.size()); + } + void ResetSourceStr(const std::string& str = "") { + auto write_holder = + std::unique_ptr<WritableFileWriter>(test::GetWritableFileWriter( + new test::StringSink(&control_contents_), "" /* don't care */)); + write_holder->Append(Slice(str)); + write_holder->Flush(); + auto read_holder = std::unique_ptr<RandomAccessFile>( + new test::StringSource(control_contents_)); + test_read_holder_ = + NewReadaheadRandomAccessFile(std::move(read_holder), readahead_size_); + } + size_t GetReadaheadSize() const { return readahead_size_; } + + private: + size_t readahead_size_; + Slice control_contents_; + std::unique_ptr<RandomAccessFile> test_read_holder_; + std::unique_ptr<char[]> scratch_; +}; + +TEST_P(ReadaheadRandomAccessFileTest, EmptySourceStrTest) { + ASSERT_EQ("", Read(0, 1)); + ASSERT_EQ("", Read(0, 0)); + ASSERT_EQ("", Read(13, 13)); +} + +TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenLessThanReadaheadSizeTest) { + std::string str = "abcdefghijklmnopqrs"; + ResetSourceStr(str); + ASSERT_EQ(str.substr(3, 4), Read(3, 4)); + ASSERT_EQ(str.substr(0, 3), Read(0, 3)); + ASSERT_EQ(str, Read(0, str.size())); + ASSERT_EQ(str.substr(7, std::min(static_cast<int>(str.size()) - 7, 30)), + Read(7, 30)); + ASSERT_EQ("", Read(100, 100)); +} + +TEST_P(ReadaheadRandomAccessFileTest, + SourceStrLenCanBeGreaterThanReadaheadSizeTest) { + Random rng(42); + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + 
rng.Uniform(static_cast<int>(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast<int>(strLen)); + ResetSourceStr(str); + for (int test = 1; test <= 100; ++test) { + size_t offset = rng.Uniform(static_cast<int>(strLen)); + size_t n = rng.Uniform(static_cast<int>(GetReadaheadSize())); + ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + Read(offset, n)); + } + } +} + +TEST_P(ReadaheadRandomAccessFileTest, NExceedReadaheadTest) { + Random rng(7); + size_t strLen = 4 * GetReadaheadSize() + + rng.Uniform(static_cast<int>(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast<int>(strLen)); + ResetSourceStr(str); + for (int test = 1; test <= 100; ++test) { + size_t offset = rng.Uniform(static_cast<int>(strLen)); + size_t n = + GetReadaheadSize() + rng.Uniform(static_cast<int>(GetReadaheadSize())); + ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + Read(offset, n)); + } +} + +INSTANTIATE_TEST_CASE_P( + EmptySourceStrTest, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenLessThanReadaheadSizeTest, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenCanBeGreaterThanReadaheadSizeTest, + ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + NExceedReadaheadTest, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/file_util.cc b/src/rocksdb/util/file_util.cc new file mode 100644 index 00000000..ba1b4744 --- /dev/null +++ b/src/rocksdb/util/file_util.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#include "util/file_util.h" + +#include <string> +#include <algorithm> + +#include "rocksdb/env.h" +#include "util/sst_file_manager_impl.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +// Utility function to copy a file up to a specified length +Status CopyFile(Env* env, const std::string& source, + const std::string& destination, uint64_t size, bool use_fsync) { + const EnvOptions soptions; + Status s; + std::unique_ptr<SequentialFileReader> src_reader; + std::unique_ptr<WritableFileWriter> dest_writer; + + { + std::unique_ptr<SequentialFile> srcfile; + s = env->NewSequentialFile(source, &srcfile, soptions); + if (!s.ok()) { + return s; + } + std::unique_ptr<WritableFile> destfile; + s = env->NewWritableFile(destination, &destfile, soptions); + if (!s.ok()) { + return s; + } + + if (size == 0) { + // default argument means copy everything + s = env->GetFileSize(source, &size); + if (!s.ok()) { + return s; + } + } + src_reader.reset(new SequentialFileReader(std::move(srcfile), source)); + dest_writer.reset( + new WritableFileWriter(std::move(destfile), destination, soptions)); + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size)); + s = src_reader->Read(bytes_to_read, &slice, buffer); + if (!s.ok()) { + return s; + } + if (slice.size() == 0) { + return Status::Corruption("file too small"); + } + s = dest_writer->Append(slice); + if (!s.ok()) { + return s; + } + size -= slice.size(); + } + return dest_writer->Sync(use_fsync); +} + +// Utility function to create a file with the provided contents +Status CreateFile(Env* env, const std::string& destination, + const std::string& contents, bool use_fsync) { + const EnvOptions soptions; + Status s; + std::unique_ptr<WritableFileWriter> dest_writer; + + std::unique_ptr<WritableFile> destfile; + s = env->NewWritableFile(destination, &destfile, soptions); + if (!s.ok()) { + return s; + } + dest_writer.reset( + new WritableFileWriter(std::move(destfile), destination, soptions)); + s = dest_writer->Append(Slice(contents)); + if (!s.ok()) { + return s; + } + return dest_writer->Sync(use_fsync); +} + +Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg) { +#ifndef ROCKSDB_LITE + SstFileManagerImpl* sfm = + static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get()); + if (sfm) { + return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); + } else { + return db_options->env->DeleteFile(fname); + } +#else + (void)dir_to_sync; + (void)force_bg; + // SstFileManager is not supported in ROCKSDB_LITE + // Delete file immediately + return db_options->env->DeleteFile(fname); +#endif +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/file_util.h b/src/rocksdb/util/file_util.h new file mode 100644 index 00000000..c3b365c8 --- /dev/null +++ b/src/rocksdb/util/file_util.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+#pragma once
+#include <string>
+
+#include "options/db_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "util/filename.h"
+
+namespace rocksdb {
+// use_fsync maps to options.use_fsync, which determines the way that
+// the file is synced after copying.
+extern Status CopyFile(Env* env, const std::string& source,
+                       const std::string& destination, uint64_t size,
+                       bool use_fsync);
+
+extern Status CreateFile(Env* env, const std::string& destination,
+                         const std::string& contents, bool use_fsync);
+
+extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
+                           const std::string& fname,
+                           const std::string& path_to_sync,
+                           const bool force_bg = false);
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/filelock_test.cc b/src/rocksdb/util/filelock_test.cc
new file mode 100644
index 00000000..f8721b59
--- /dev/null
+++ b/src/rocksdb/util/filelock_test.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/status.h"
+#include "rocksdb/env.h"
+
+#include <vector>
+#include <fcntl.h>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class LockTest : public testing::Test {
+ public:
+  static LockTest* current_;
+  std::string file_;
+  rocksdb::Env* env_;
+
+  LockTest()
+      : file_(test::PerThreadDBPath("db_testlock_file")),
+        env_(rocksdb::Env::Default()) {
+    current_ = this;
+  }
+
+  ~LockTest() override {}
+
+  Status LockFile(FileLock** db_lock) {
+    return env_->LockFile(file_, db_lock);
+  }
+
+  Status UnlockFile(FileLock* db_lock) {
+    return env_->UnlockFile(db_lock);
+  }
+
+  bool AssertFileIsLocked(){
+    return CheckFileLock( /* lock_expected = */ true);
+  }
+
+  bool AssertFileIsNotLocked(){
+    return CheckFileLock( /* lock_expected = */ false);
+  }
+
+  bool CheckFileLock(bool lock_expected){
+    // We need to fork to check the fcntl lock as we need
+    // to open and close the file from a different process
+    // to avoid either releasing the lock on close, or not
+    // contending for it when requesting a lock.
+
+#ifdef OS_WIN
+
+    // WaitForSingleObject and GetExitCodeProcess can do what waitpid does.
+    // TODO - implement on Windows
+    return true;
+
+#else
+
+    pid_t pid = fork();
+    if ( 0 == pid ) {
+      // child process
+      int exit_val = EXIT_FAILURE;
+      int fd = open(file_.c_str(), O_RDWR | O_CREAT, 0644);
+      if (fd < 0) {
+        // could not open file, could not check if it was locked
+        fprintf( stderr, "Open on file %s failed.\n",file_.c_str());
+        exit(exit_val);
+      }
+
+      struct flock f;
+      memset(&f, 0, sizeof(f));
+      f.l_type = (F_WRLCK);
+      f.l_whence = SEEK_SET;
+      f.l_start = 0;
+      f.l_len = 0; // Lock/unlock entire file
+      int value = fcntl(fd, F_SETLK, &f);
+      if( value == -1 ){
+        if( lock_expected ){
+          exit_val = EXIT_SUCCESS;
+        }
+      } else {
+        if( !
lock_expected ){ + exit_val = EXIT_SUCCESS; + } + } + close(fd); // lock is released for child process + exit(exit_val); + } else if (pid > 0) { + // parent process + int status; + while (-1 == waitpid(pid, &status, 0)); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + // child process exited with non success status + return false; + } else { + return true; + } + } else { + fprintf( stderr, "Fork failed\n" ); + return false; + } + return false; + +#endif + + } + +}; +LockTest* LockTest::current_; + +TEST_F(LockTest, LockBySameThread) { + FileLock* lock1; + FileLock* lock2; + + // acquire a lock on a file + ASSERT_OK(LockFile(&lock1)); + + // check the file is locked + ASSERT_TRUE( AssertFileIsLocked() ); + + // re-acquire the lock on the same file. This should fail. + ASSERT_TRUE(LockFile(&lock2).IsIOError()); + + // check the file is locked + ASSERT_TRUE( AssertFileIsLocked() ); + + // release the lock + ASSERT_OK(UnlockFile(lock1)); + + // check the file is not locked + ASSERT_TRUE( AssertFileIsNotLocked() ); + +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/filename.cc b/src/rocksdb/util/filename.cc new file mode 100644 index 00000000..32289aec --- /dev/null +++ b/src/rocksdb/util/filename.cc @@ -0,0 +1,410 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "util/filename.h" +#include <inttypes.h> + +#include <ctype.h> +#include <stdio.h> +#include <vector> +#include "rocksdb/env.h" +#include "util/file_reader_writer.h" +#include "util/logging.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/sync_point.h" + +namespace rocksdb { + +static const std::string kRocksDbTFileExt = "sst"; +static const std::string kLevelDbTFileExt = "ldb"; +static const std::string kRocksDBBlobFileExt = "blob"; + +// Given a path, flatten the path name by replacing all chars not in +// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end. +// Return the number of chars stored in dest not including the trailing '\0'. +static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { + const char suffix[] = "_LOG"; + + size_t write_idx = 0; + size_t i = 0; + size_t src_len = path.size(); + + while (i < src_len && write_idx < len - sizeof(suffix)) { + if ((path[i] >= 'a' && path[i] <= 'z') || + (path[i] >= '0' && path[i] <= '9') || + (path[i] >= 'A' && path[i] <= 'Z') || + path[i] == '-' || + path[i] == '.' 
|| + path[i] == '_'){ + dest[write_idx++] = path[i]; + } else { + if (i > 0) { + dest[write_idx++] = '_'; + } + } + i++; + } + assert(sizeof(suffix) <= len - write_idx); + // "\0" is automatically added by snprintf + snprintf(dest + write_idx, len - write_idx, suffix); + write_idx += sizeof(suffix) - 1; + return write_idx; +} + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast<unsigned long long>(number), + suffix); + return name + buf; +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string BlobFileName(const std::string& blobdirname, uint64_t number) { + assert(number > 0); + return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); +} + +std::string BlobFileName(const std::string& dbname, const std::string& blob_dir, + uint64_t number) { + assert(number > 0); + return MakeFileName(dbname + "/" + blob_dir, number, + kRocksDBBlobFileExt.c_str()); +} + +std::string ArchivalDirectory(const std::string& dir) { + return dir + "/" + ARCHIVAL_DIR; +} +std::string ArchivedLogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log"); +} + +std::string MakeTableFileName(const std::string& path, uint64_t number) { + return MakeFileName(path, number, kRocksDbTFileExt.c_str()); +} + +std::string Rocks2LevelTableFileName(const std::string& fullname) { + assert(fullname.size() > kRocksDbTFileExt.size() + 1); + if (fullname.size() <= kRocksDbTFileExt.size() + 1) { + return ""; + } + return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) + + kLevelDbTFileExt; +} + +uint64_t TableFileNameToNumber(const std::string& name) { + uint64_t number = 0; + uint64_t base = 1; + int pos = static_cast<int>(name.find_last_of('.')); + while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') { + number += (name[pos] - '0') * base; + base *= 10; + } + return number; +} + +std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number, + uint32_t path_id) { + assert(number > 0); + std::string path; + if (path_id >= db_paths.size()) { + path = db_paths.back().path; + } else { + path = db_paths[path_id].path; + } + return MakeTableFileName(path, number); +} + +void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, + size_t out_buf_size) { + if (path_id == 0) { + snprintf(out_buf, out_buf_size, "%" PRIu64, number); + } else { + snprintf(out_buf, out_buf_size, "%" PRIu64 + "(path " + "%" PRIu32 ")", + number, path_id); + } +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + static_cast<unsigned long long>(number)); + return dbname + buf; +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/CURRENT"; +} + +std::string LockFileName(const std::string& dbname) { + return dbname + "/LOCK"; +} + +std::string TempFileName(const std::string& dbname, uint64_t number) { + return MakeFileName(dbname, number, kTempFileNameSuffix.c_str()); +} + +InfoLogPrefix::InfoLogPrefix(bool has_log_dir, + const std::string& db_absolute_path) { + if (!has_log_dir) { + const char kInfoLogPrefix[] = "LOG"; + // "\0" is automatically added to the end + snprintf(buf, sizeof(buf), kInfoLogPrefix); + prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1); 
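+    // (Editorial note: sizeof(kInfoLogPrefix) counts the trailing '\0', so
+    // the "- 1" makes the prefix exactly the three characters "LOG".)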
+ } else { + size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf)); + prefix = Slice(buf, len); + } +} + +std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path, const std::string& log_dir) { + if (log_dir.empty()) { + return dbname + "/LOG"; + } + + InfoLogPrefix info_log_prefix(true, db_path); + return log_dir + "/" + info_log_prefix.buf; +} + +// Return the name of the old info log file for "dbname". +std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path, const std::string& log_dir) { + char buf[50]; + snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts)); + + if (log_dir.empty()) { + return dbname + "/LOG.old." + buf; + } + + InfoLogPrefix info_log_prefix(true, db_path); + return log_dir + "/" + info_log_prefix.buf + ".old." + buf; +} + +std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64, + kOptionsFileNamePrefix.c_str(), file_num); + return dbname + "/" + buffer; +} + +std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s", + kOptionsFileNamePrefix.c_str(), file_num, + kTempFileNameSuffix.c_str()); + return dbname + "/" + buffer; +} + +std::string MetaDatabaseName(const std::string& dbname, uint64_t number) { + char buf[100]; + snprintf(buf, sizeof(buf), "/METADB-%llu", + static_cast<unsigned long long>(number)); + return dbname + buf; +} + +std::string IdentityFileName(const std::string& dbname) { + return dbname + "/IDENTITY"; +} + +// Owned filenames have the form: +// dbname/IDENTITY +// dbname/CURRENT +// dbname/LOCK +// dbname/<info_log_name_prefix> +// dbname/<info_log_name_prefix>.old.[0-9]+ +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9]+.(log|sst|blob) +// dbname/METADB-[0-9]+ +// dbname/OPTIONS-[0-9]+ +// dbname/OPTIONS-[0-9]+.dbtmp +// Disregards / at the beginning +bool ParseFileName(const std::string& fname, + uint64_t* number, + FileType* type, + WalFileType* log_type) { + return ParseFileName(fname, number, "", type, log_type); +} + +bool ParseFileName(const std::string& fname, uint64_t* number, + const Slice& info_log_name_prefix, FileType* type, + WalFileType* log_type) { + Slice rest(fname); + if (fname.length() > 1 && fname[0] == '/') { + rest.remove_prefix(1); + } + if (rest == "IDENTITY") { + *number = 0; + *type = kIdentityFile; + } else if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (info_log_name_prefix.size() > 0 && + rest.starts_with(info_log_name_prefix)) { + rest.remove_prefix(info_log_name_prefix.size()); + if (rest == "" || rest == ".old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.starts_with(".old.")) { + uint64_t ts_suffix; + // sizeof also counts the trailing '\0'. 
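+      // (For example, sizeof(".old.") == 6, so the five characters ".old."
+      // are stripped here.)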
+ rest.remove_prefix(sizeof(".old.") - 1); + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = kInfoLogFile; + } + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else if (rest.starts_with("METADB-")) { + rest.remove_prefix(strlen("METADB-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kMetaDatabase; + *number = num; + } else if (rest.starts_with(kOptionsFileNamePrefix)) { + uint64_t ts_suffix; + bool is_temp_file = false; + rest.remove_prefix(kOptionsFileNamePrefix.size()); + const std::string kTempFileNameSuffixWithDot = + std::string(".") + kTempFileNameSuffix; + if (rest.ends_with(kTempFileNameSuffixWithDot)) { + rest.remove_suffix(kTempFileNameSuffixWithDot.size()); + is_temp_file = true; + } + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = is_temp_file ? kTempFile : kOptionsFile; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + bool archive_dir_found = false; + if (rest.starts_with(ARCHIVAL_DIR)) { + if (rest.size() <= ARCHIVAL_DIR.size()) { + return false; + } + rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also + if (log_type) { + *log_type = kArchivedLogFile; + } + archive_dir_found = true; + } + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (rest.size() <= 1 || rest[0] != '.') { + return false; + } + rest.remove_prefix(1); + + Slice suffix = rest; + if (suffix == Slice("log")) { + *type = kLogFile; + if (log_type && !archive_dir_found) { + *log_type = kAliveLogFile; + } + } else if (archive_dir_found) { + return false; // Archive dir can contain only log files + } else if (suffix == Slice(kRocksDbTFileExt) || + suffix == Slice(kLevelDbTFileExt)) { + *type = kTableFile; + } else if (suffix == Slice(kRocksDBBlobFileExt)) { + *type = kBlobFile; + } else if (suffix == Slice(kTempFileNameSuffix)) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number, + Directory* directory_to_fsync) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true); + if (s.ok()) { + TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + s = env->RenameFile(tmp, CurrentFileName(dbname)); + TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2); + } + if (s.ok()) { + if (directory_to_fsync != nullptr) { + s = directory_to_fsync->Fsync(); + } + } else { + env->DeleteFile(tmp); + } + return s; +} + +Status SetIdentityFile(Env* env, const std::string& dbname) { + std::string id = env->GenerateUniqueId(); + assert(!id.empty()); + // Reserve the filename dbname/000000.dbtmp for the temporary identity file + std::string tmp = TempFileName(dbname, 0); + Status s = WriteStringToFile(env, id, tmp, true); + if (s.ok()) { + s 
= env->RenameFile(tmp, IdentityFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, + WritableFileWriter* file) { + TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2); + StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + return file->Sync(db_options->use_fsync); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/filename.h b/src/rocksdb/util/filename.h new file mode 100644 index 00000000..eea6b1b0 --- /dev/null +++ b/src/rocksdb/util/filename.h @@ -0,0 +1,172 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#pragma once +#include <stdint.h> +#include <unordered_map> +#include <string> +#include <vector> + +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/transaction_log.h" + +namespace rocksdb { + +class Env; +class Directory; +class WritableFileWriter; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string LogFileName(const std::string& dbname, uint64_t number); + +extern std::string BlobFileName(const std::string& bdirname, uint64_t number); + +extern std::string BlobFileName(const std::string& dbname, + const std::string& blob_dir, uint64_t number); + +static const std::string ARCHIVAL_DIR = "archive"; + +extern std::string ArchivalDirectory(const std::string& dbname); + +// Return the name of the archived log file with the specified number +// in the db named by "dbname". The result will be prefixed with "dbname". +extern std::string ArchivedLogFileName(const std::string& dbname, + uint64_t num); + +extern std::string MakeTableFileName(const std::string& name, uint64_t number); + +// Return the name of sstable with LevelDB suffix +// created from RocksDB sstable suffixed name +extern std::string Rocks2LevelTableFileName(const std::string& fullname); + +// the reverse function of MakeTableFileName +// TODO(yhchiang): could merge this function with ParseFileName() +extern uint64_t TableFileNameToNumber(const std::string& name); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::vector<DbPath>& db_paths, + uint64_t number, uint32_t path_id); + +// Sufficient buffer size for FormatFileNumber. +const size_t kFormatFileNumberBufSize = 38; + +extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, + size_t out_buf_size); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. 
The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file. This file contains the name
+// of the current manifest file. The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname". The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// A helper structure for the prefix of info log names.
+struct InfoLogPrefix {
+  char buf[260];
+  Slice prefix;
+  // Prefix with DB absolute path encoded
+  explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+  // Default Prefix
+  explicit InfoLogPrefix();
+};
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+                                   const std::string& db_path = "",
+                                   const std::string& log_dir = "");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+                                      const std::string& db_path = "",
+                                      const std::string& log_dir = "");
+
+static const std::string kOptionsFileNamePrefix = "OPTIONS-";
+static const std::string kTempFileNameSuffix = "dbtmp";
+
+// Return an options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number]
+extern std::string OptionsFileName(const std::string& dbname,
+                                   uint64_t file_num);
+
+// Return a temp options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number].dbtmp
+extern std::string TempOptionsFileName(const std::string& dbname,
+                                       uint64_t file_num);
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+                                    uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh,
+// either from a backup image or empty.
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number. If the
+// filename was successfully parsed, returns true. Else returns false.
+// info_log_name_prefix is the path of info logs.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          const Slice& info_log_name_prefix, FileType* type,
+                          WalFileType* log_type = nullptr);
+// Same as previous function, but skips info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          FileType* type, WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number,
+                             Directory* directory_to_fsync);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname);
+
+// Sync manifest file `file`.
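+// (Editorial note: per the definition in filename.cc above, this wraps
+// file->Sync(db_options->use_fsync) and times it under
+// MANIFEST_FILE_SYNC_MICROS.)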
+extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, + WritableFileWriter* file); + +} // namespace rocksdb diff --git a/src/rocksdb/util/filter_policy.cc b/src/rocksdb/util/filter_policy.cc new file mode 100644 index 00000000..efb9bf47 --- /dev/null +++ b/src/rocksdb/util/filter_policy.cc @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +FilterPolicy::~FilterPolicy() { } + +} // namespace rocksdb diff --git a/src/rocksdb/util/gflags_compat.h b/src/rocksdb/util/gflags_compat.h new file mode 100644 index 00000000..0ea3aef5 --- /dev/null +++ b/src/rocksdb/util/gflags_compat.h @@ -0,0 +1,12 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <gflags/gflags.h> + +#ifndef GFLAGS_NAMESPACE +// in case it's not defined in old versions, that's probably because it was +// still google by default. +#define GFLAGS_NAMESPACE google +#endif diff --git a/src/rocksdb/util/hash.cc b/src/rocksdb/util/hash.cc new file mode 100644 index 00000000..852710d7 --- /dev/null +++ b/src/rocksdb/util/hash.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <string.h> +#include "util/coding.h" +#include "util/hash.h" +#include "util/util.h" + +namespace rocksdb { + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // Similar to murmur hash + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = static_cast<uint32_t>(seed ^ (n * m)); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + // Note: The original hash implementation used data[i] << shift, which + // promotes the char to int and then performs the shift. If the char is + // negative, the shift is undefined behavior in C++. The hash algorithm is + // part of the format definition, so we cannot change it; to obtain the same + // behavior in a legal way we just cast to uint32_t, which will do + // sign-extension. To guarantee compatibility with architectures where chars + // are unsigned we first cast the char to int8_t. 
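+  // (Editorial example: static_cast<uint32_t>(static_cast<int8_t>('\x9a'))
+  // yields 0xFFFFFF9A, the sign-extended value, regardless of whether plain
+  // char is signed on the target platform.)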
+    case 3:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
+      FALLTHROUGH_INTENDED;
+    case 2:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
+      FALLTHROUGH_INTENDED;
+    case 1:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/hash.h b/src/rocksdb/util/hash.h
new file mode 100644
index 00000000..ed42b089
--- /dev/null
+++ b/src/rocksdb/util/hash.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Simple hash function used for internal data structures
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+#include "util/murmurhash.h"
+
+namespace rocksdb {
+
+// Non-persistent hash. Only used for in-memory data structures.
+// The hash results are subject to change.
+extern uint64_t NPHash64(const char* data, size_t n, uint32_t seed);
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+inline uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+inline uint64_t GetSliceNPHash64(const Slice& s) {
+  return NPHash64(s.data(), s.size(), 0);
+}
+
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397);
+}
+
+inline uint64_t NPHash64(const char* data, size_t n, uint32_t seed) {
+  // Right now murmurhash2B is used. It should be possible to change it
+  // freely to a better hash, without worrying about backward
+  // compatibility issues.
+  return MURMUR_HASH(data, static_cast<int>(n),
+                     static_cast<unsigned int>(seed));
+}
+
+// std::hash compatible interface.
+struct SliceHasher {
+  uint32_t operator()(const Slice& s) const { return GetSliceHash(s); }
+};
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/hash_map.h b/src/rocksdb/util/hash_map.h
new file mode 100644
index 00000000..7b08fb39
--- /dev/null
+++ b/src/rocksdb/util/hash_map.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <utility>
+
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+// This is similar to std::unordered_map, except that it tries to avoid
+// allocating or deallocating memory as much as possible. With
+// std::unordered_map, an allocation/deallocation is made for every insertion
+// or deletion because of the requirement that iterators remain valid even
+// with insertions or deletions. This means that the hash chains will be
+// implemented as linked lists.
+//
+// This implementation uses autovector as hash chains instead.
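+//
+// Illustrative usage (editorial sketch, not part of the original patch):
+//
+//   HashMap<uint64_t, int> m;
+//   m.Insert(42, 1);
+//   if (m.Contains(42)) {
+//     int v = m.Get(42);  // Get() is undefined if the key is absent
+//     (void)v;
+//   }
+//   m.Delete(42);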
+// +template <typename K, typename V, size_t size = 128> +class HashMap { + std::array<autovector<std::pair<K, V>, 1>, size> table_; + + public: + bool Contains(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair<K, V>& p) { return p.first == key; }); + return it != bucket.end(); + } + + void Insert(K key, V value) { + auto& bucket = table_[key % size]; + bucket.push_back({key, value}); + } + + void Delete(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair<K, V>& p) { return p.first == key; }); + if (it != bucket.end()) { + auto last = bucket.end() - 1; + if (it != last) { + *it = *last; + } + bucket.pop_back(); + } + } + + V& Get(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair<K, V>& p) { return p.first == key; }); + return it->second; + } +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/hash_test.cc b/src/rocksdb/util/hash_test.cc new file mode 100644 index 00000000..959e8cd0 --- /dev/null +++ b/src/rocksdb/util/hash_test.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <vector> + +#include "util/hash.h" +#include "util/testharness.h" + +// The hash algorithm is part of the file format, for example for the Bloom +// filters. Test that the hash values are stable for a set of random strings of +// varying lengths. +TEST(HashTest, Values) { + using rocksdb::Hash; + constexpr uint32_t kSeed = 0xbc9f1d34; // Same as BloomHash. 
+ + EXPECT_EQ(Hash("", 0, kSeed), 3164544308); + EXPECT_EQ(Hash("\x08", 1, kSeed), 422599524); + EXPECT_EQ(Hash("\x17", 1, kSeed), 3168152998); + EXPECT_EQ(Hash("\x9a", 1, kSeed), 3195034349); + EXPECT_EQ(Hash("\x1c", 1, kSeed), 2651681383); + EXPECT_EQ(Hash("\x4d\x76", 2, kSeed), 2447836956); + EXPECT_EQ(Hash("\x52\xd5", 2, kSeed), 3854228105); + EXPECT_EQ(Hash("\x91\xf7", 2, kSeed), 31066776); + EXPECT_EQ(Hash("\xd6\x27", 2, kSeed), 1806091603); + EXPECT_EQ(Hash("\x30\x46\x0b", 3, kSeed), 3808221797); + EXPECT_EQ(Hash("\x56\xdc\xd6", 3, kSeed), 2157698265); + EXPECT_EQ(Hash("\xd4\x52\x33", 3, kSeed), 1721992661); + EXPECT_EQ(Hash("\x6a\xb5\xf4", 3, kSeed), 2469105222); + EXPECT_EQ(Hash("\x67\x53\x81\x1c", 4, kSeed), 118283265); + EXPECT_EQ(Hash("\x69\xb8\xc0\x88", 4, kSeed), 3416318611); + EXPECT_EQ(Hash("\x1e\x84\xaf\x2d", 4, kSeed), 3315003572); + EXPECT_EQ(Hash("\x46\xdc\x54\xbe", 4, kSeed), 447346355); + EXPECT_EQ(Hash("\xd0\x7a\x6e\xea\x56", 5, kSeed), 4255445370); + EXPECT_EQ(Hash("\x86\x83\xd5\xa4\xd8", 5, kSeed), 2390603402); + EXPECT_EQ(Hash("\xb7\x46\xbb\x77\xce", 5, kSeed), 2048907743); + EXPECT_EQ(Hash("\x6c\xa8\xbc\xe5\x99", 5, kSeed), 2177978500); + EXPECT_EQ(Hash("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), 1036846008); + EXPECT_EQ(Hash("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), 229980482); + EXPECT_EQ(Hash("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), 3655585422); + EXPECT_EQ(Hash("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), 3502708029); + EXPECT_EQ(Hash("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), 815120748); + EXPECT_EQ(Hash("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), 3056033698); + EXPECT_EQ(Hash("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), 587205227); + EXPECT_EQ(Hash("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), 2030937252); + EXPECT_EQ(Hash("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), 469635402); + EXPECT_EQ(Hash("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), 3530274698); + EXPECT_EQ(Hash("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), 1974545809); + EXPECT_EQ(Hash("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), 3563570120); + EXPECT_EQ(Hash("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed), 2706087434); + EXPECT_EQ(Hash("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed), 1534654151); + EXPECT_EQ(Hash("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed), 2355554696); + EXPECT_EQ(Hash("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed), 1400800912); + EXPECT_EQ(Hash("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed), + 3420325137); + EXPECT_EQ(Hash("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed), + 3427803584); + EXPECT_EQ(Hash("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed), + 1152407945); + EXPECT_EQ(Hash("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed), + 3382479516); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/heap.h b/src/rocksdb/util/heap.h new file mode 100644 index 00000000..6093c20e --- /dev/null +++ b/src/rocksdb/util/heap.h @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <algorithm> +#include <cstdint> +#include <functional> +#include "port/port.h" +#include "util/autovector.h" + +namespace rocksdb { + +// Binary heap implementation optimized for use in multi-way merge sort. 
+// Comparison to std::priority_queue: +// - In libstdc++, std::priority_queue::pop() usually performs just over logN +// comparisons but never fewer. +// - std::priority_queue does not have a replace-top operation, requiring a +// pop+push. If the replacement element is the new top, this requires +// around 2logN comparisons. +// - This heap's pop() uses a "schoolbook" downheap which requires up to ~2logN +// comparisons. +// - This heap provides a replace_top() operation which requires [1, 2logN] +// comparisons. When the replacement element is also the new top, this +// takes just 1 or 2 comparisons. +// +// The last property can yield an order-of-magnitude performance improvement +// when merge-sorting real-world non-random data. If the merge operation is +// likely to take chunks of elements from the same input stream, only 1 +// comparison per element is needed. In RocksDB-land, this happens when +// compacting a database where keys are not randomly distributed across L0 +// files but nearby keys are likely to be in the same L0 file. +// +// The container uses the same counterintuitive ordering as +// std::priority_queue: the comparison operator is expected to provide the +// less-than relation, but top() will return the maximum. + +template<typename T, typename Compare = std::less<T>> +class BinaryHeap { + public: + BinaryHeap() { } + explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) { } + + void push(const T& value) { + data_.push_back(value); + upheap(data_.size() - 1); + } + + void push(T&& value) { + data_.push_back(std::move(value)); + upheap(data_.size() - 1); + } + + const T& top() const { + assert(!empty()); + return data_.front(); + } + + void replace_top(const T& value) { + assert(!empty()); + data_.front() = value; + downheap(get_root()); + } + + void replace_top(T&& value) { + assert(!empty()); + data_.front() = std::move(value); + downheap(get_root()); + } + + void pop() { + assert(!empty()); + data_.front() = std::move(data_.back()); + data_.pop_back(); + if (!empty()) { + downheap(get_root()); + } else { + reset_root_cmp_cache(); + } + } + + void swap(BinaryHeap &other) { + std::swap(cmp_, other.cmp_); + data_.swap(other.data_); + std::swap(root_cmp_cache_, other.root_cmp_cache_); + } + + void clear() { + data_.clear(); + reset_root_cmp_cache(); + } + + bool empty() const { return data_.empty(); } + + size_t size() const { return data_.size(); } + + void reset_root_cmp_cache() { root_cmp_cache_ = port::kMaxSizet; } + + private: + static inline size_t get_root() { return 0; } + static inline size_t get_parent(size_t index) { return (index - 1) / 2; } + static inline size_t get_left(size_t index) { return 2 * index + 1; } + static inline size_t get_right(size_t index) { return 2 * index + 2; } + + void upheap(size_t index) { + T v = std::move(data_[index]); + while (index > get_root()) { + const size_t parent = get_parent(index); + if (!cmp_(data_[parent], v)) { + break; + } + data_[index] = std::move(data_[parent]); + index = parent; + } + data_[index] = std::move(v); + reset_root_cmp_cache(); + } + + void downheap(size_t index) { + T v = std::move(data_[index]); + + size_t picked_child = port::kMaxSizet; + while (1) { + const size_t left_child = get_left(index); + if (get_left(index) >= data_.size()) { + break; + } + const size_t right_child = left_child + 1; + assert(right_child == get_right(index)); + picked_child = left_child; + if (index == 0 && root_cmp_cache_ < data_.size()) { + picked_child = root_cmp_cache_; + } else if (right_child < data_.size() && + 
cmp_(data_[left_child], data_[right_child])) {
+        picked_child = right_child;
+      }
+      if (!cmp_(v, data_[picked_child])) {
+        break;
+      }
+      data_[index] = std::move(data_[picked_child]);
+      index = picked_child;
+    }
+
+    if (index == 0) {
+      // We did not change anything in the tree except for the value of the
+      // root node, and its left and right children did not change, so we can
+      // cache that `picked_child` is the larger (per cmp_) of the two
+      // children and compare against it directly next time.
+      root_cmp_cache_ = picked_child;
+    } else {
+      // the tree changed, reset cache
+      reset_root_cmp_cache();
+    }
+
+    data_[index] = std::move(v);
+  }
+
+  Compare cmp_;
+  autovector<T> data_;
+  // Used to reduce number of cmp_ calls in downheap()
+  size_t root_cmp_cache_ = port::kMaxSizet;
+};
+
+} // namespace rocksdb
diff --git a/src/rocksdb/util/heap_test.cc b/src/rocksdb/util/heap_test.cc
new file mode 100644
index 00000000..d036a62e
--- /dev/null
+++ b/src/rocksdb/util/heap_test.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <gtest/gtest.h>
+
+#include <climits>
+
+#include <queue>
+#include <random>
+#include <utility>
+
+#include "util/heap.h"
+
+#ifndef GFLAGS
+const int64_t FLAGS_iters = 100000;
+#else
+#include "util/gflags_compat.h"
+DEFINE_int64(iters, 100000, "number of pseudo-random operations in each test");
+#endif  // GFLAGS
+
+/*
+ * Compares the custom heap implementation in util/heap.h against
+ * std::priority_queue on a pseudo-random sequence of operations.
+ */
+
+namespace rocksdb {
+
+using HeapTestValue = uint64_t;
+using Params = std::tuple<size_t, HeapTestValue, int64_t>;
+
+class HeapTest : public ::testing::TestWithParam<Params> {
+};
+
+TEST_P(HeapTest, Test) {
+  // This test performs the same pseudorandom sequence of operations on a
+  // BinaryHeap and an std::priority_queue, comparing output. The three
+  // possible operations are insert, replace top and pop.
+  //
+  // Insert is chosen slightly more often than the others so that the size of
+  // the heap slowly grows. Once the size hits the MAX_HEAP_SIZE limit, we
+  // disallow inserting until the heap becomes empty, testing the "draining"
+  // scenario.
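+  //
+  // (Editorial note: each replace_top(val) on the BinaryHeap is mirrored on
+  // the std::priority_queue reference as pop() followed by push(val), since
+  // std::priority_queue has no replace-top operation.)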
+ + const auto MAX_HEAP_SIZE = std::get<0>(GetParam()); + const auto MAX_VALUE = std::get<1>(GetParam()); + const auto RNG_SEED = std::get<2>(GetParam()); + + BinaryHeap<HeapTestValue> heap; + std::priority_queue<HeapTestValue> ref; + + std::mt19937 rng(static_cast<unsigned int>(RNG_SEED)); + std::uniform_int_distribution<HeapTestValue> value_dist(0, MAX_VALUE); + int ndrains = 0; + bool draining = false; // hit max size, draining until we empty the heap + size_t size = 0; + for (int64_t i = 0; i < FLAGS_iters; ++i) { + if (size == 0) { + draining = false; + } + + if (!draining && + (size == 0 || std::bernoulli_distribution(0.4)(rng))) { + // insert + HeapTestValue val = value_dist(rng); + heap.push(val); + ref.push(val); + ++size; + if (size == MAX_HEAP_SIZE) { + draining = true; + ++ndrains; + } + } else if (std::bernoulli_distribution(0.5)(rng)) { + // replace top + HeapTestValue val = value_dist(rng); + heap.replace_top(val); + ref.pop(); + ref.push(val); + } else { + // pop + assert(size > 0); + heap.pop(); + ref.pop(); + --size; + } + + // After every operation, check that the public methods give the same + // results + assert((size == 0) == ref.empty()); + ASSERT_EQ(size == 0, heap.empty()); + if (size > 0) { + ASSERT_EQ(ref.top(), heap.top()); + } + } + + // Probabilities should be set up to occasionally hit the max heap size and + // drain it + assert(ndrains > 0); + + heap.clear(); + ASSERT_TRUE(heap.empty()); +} + +// Basic test, MAX_VALUE = 3*MAX_HEAP_SIZE (occasional duplicates) +INSTANTIATE_TEST_CASE_P( + Basic, HeapTest, + ::testing::Values(Params(1000, 3000, 0x1b575cf05b708945)) +); +// Mid-size heap with small values (many duplicates) +INSTANTIATE_TEST_CASE_P( + SmallValues, HeapTest, + ::testing::Values(Params(100, 10, 0x5ae213f7bd5dccd0)) +); +// Small heap, large value range (no duplicates) +INSTANTIATE_TEST_CASE_P( + SmallHeap, HeapTest, + ::testing::Values(Params(10, ULLONG_MAX, 0x3e1fa8f4d01707cf)) +); +// Two-element heap +INSTANTIATE_TEST_CASE_P( + TwoElementHeap, HeapTest, + ::testing::Values(Params(2, 5, 0x4b5e13ea988c6abc)) +); +// One-element heap +INSTANTIATE_TEST_CASE_P( + OneElementHeap, HeapTest, + ::testing::Values(Params(1, 3, 0x176a1019ab0b612e)) +); + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/jemalloc_nodump_allocator.cc b/src/rocksdb/util/jemalloc_nodump_allocator.cc new file mode 100644 index 00000000..cdd08e93 --- /dev/null +++ b/src/rocksdb/util/jemalloc_nodump_allocator.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
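To make the replace_top() payoff described in util/heap.h concrete, here is a minimal k-way merge sketch built on the BinaryHeap above. It is illustrative only and not part of the diff: the Entry type, the MergeSorted helper, and plain int vectors as input streams are all invented for the example; std::greater turns the max-heap into a min-heap so top() yields the smallest pending element.

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

#include "util/heap.h"

// (current value, index of the source stream it came from)
using Entry = std::pair<int, size_t>;

std::vector<int> MergeSorted(const std::vector<std::vector<int>>& streams) {
  // BinaryHeap is a max-heap with respect to its comparator, so supplying
  // std::greater makes top() the minimum -- what merging ascending streams
  // needs.
  rocksdb::BinaryHeap<Entry, std::greater<Entry>> heap;
  std::vector<size_t> pos(streams.size(), 0);
  for (size_t i = 0; i < streams.size(); ++i) {
    if (!streams[i].empty()) {
      heap.push(Entry(streams[i][0], i));
    }
  }
  std::vector<int> out;
  while (!heap.empty()) {
    const Entry e = heap.top();
    out.push_back(e.first);
    const size_t s = e.second;
    if (++pos[s] < streams[s].size()) {
      // replace_top() instead of pop()+push(): when the same stream keeps
      // winning (clustered keys), root_cmp_cache_ cuts this to 1-2
      // comparisons per element.
      heap.replace_top(Entry(streams[s][pos[s]], s));
    } else {
      heap.pop();
    }
  }
  return out;
}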
+
+#include "util/jemalloc_nodump_allocator.h"
+
+#include <string>
+#include <thread>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr};
+
+JemallocNodumpAllocator::JemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::unique_ptr<extent_hooks_t>&& arena_hooks, unsigned arena_index)
+    : options_(options),
+      arena_hooks_(std::move(arena_hooks)),
+      arena_index_(arena_index),
+      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {}
+
+int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) {
+  // We always enable tcache. The only corner case is when many threads
+  // access the allocator infrequently: tcache memory can then grow large
+  // (it may reach # threads * ~1MB) without bringing much benefit.
+  if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound ||
+                                     size > options_.tcache_size_upper_bound)) {
+    return MALLOCX_TCACHE_NONE;
+  }
+  unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get());
+  if (UNLIKELY(tcache_index == nullptr)) {
+    // Instantiate tcache.
+    tcache_index = new unsigned(0);
+    size_t tcache_index_size = sizeof(unsigned);
+    int ret =
+        mallctl("tcache.create", tcache_index, &tcache_index_size, nullptr, 0);
+    if (ret != 0) {
+      // No good way to expose the error. Silently disable tcache.
+      delete tcache_index;
+      return MALLOCX_TCACHE_NONE;
+    }
+    tcache_.Reset(static_cast<void*>(tcache_index));
+  }
+  return MALLOCX_TCACHE(*tcache_index);
+}
+
+void* JemallocNodumpAllocator::Allocate(size_t size) {
+  int tcache_flag = GetThreadSpecificCache(size);
+  return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag);
+}
+
+void JemallocNodumpAllocator::Deallocate(void* p) {
+  // Obtain tcache.
+  size_t size = 0;
+  if (options_.limit_tcache_size) {
+    size = malloc_usable_size(p);
+  }
+  int tcache_flag = GetThreadSpecificCache(size);
+  // No need to pass arena index to dallocx(). Jemalloc will find arena index
+  // from its own metadata.
+  dallocx(p, tcache_flag);
+}
+
+void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
+                                     size_t size, size_t alignment, bool* zero,
+                                     bool* commit, unsigned arena_ind) {
+  extent_alloc_t* original_alloc =
+      original_alloc_.load(std::memory_order_relaxed);
+  assert(original_alloc != nullptr);
+  void* result = original_alloc(extent, new_addr, size, alignment, zero,
+                                commit, arena_ind);
+  if (result != nullptr) {
+    int ret = madvise(result, size, MADV_DONTDUMP);
+    if (ret != 0) {
+      fprintf(stderr,
+              "JemallocNodumpAllocator failed to set MADV_DONTDUMP, "
+              "error code: %d\n",
+              ret);
+      assert(false);
+    }
+  }
+  return result;
+}
+
+Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) {
+  assert(arena_index != 0);
+  std::string key = "arena." + ToString(arena_index) + ".destroy";
+  int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to destroy jemalloc arena, error code: " +
+                              ToString(ret));
+  }
+  return Status::OK();
+}
+
+void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) {
+  assert(ptr != nullptr);
+  unsigned* tcache_index = static_cast<unsigned*>(ptr);
+  size_t tcache_index_size = sizeof(unsigned);
+  int ret __attribute__((__unused__)) =
+      mallctl("tcache.destroy", nullptr, 0, tcache_index, tcache_index_size);
+  // Silently ignore error.
+  assert(ret == 0);
+  delete tcache_index;
+}
+
+JemallocNodumpAllocator::~JemallocNodumpAllocator() {
+  // Destroy tcache before destroying arena.
+  autovector<void*> tcache_list;
+  tcache_.Scrape(&tcache_list, nullptr);
+  for (void* tcache_index : tcache_list) {
+    DestroyThreadSpecificCache(tcache_index);
+  }
+  // Destroy arena. Silently ignore error.
+  Status s __attribute__((__unused__)) = DestroyArena(arena_index_);
+  assert(s.ok());
+}
+
+size_t JemallocNodumpAllocator::UsableSize(void* p,
+                                           size_t /*allocation_size*/) const {
+  return malloc_usable_size(static_cast<void*>(p));
+}
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator) {
+  if (memory_allocator == nullptr) {
+    return Status::InvalidArgument("memory_allocator must be non-null.");
+  }
+  *memory_allocator = nullptr;
+  Status unsupported = Status::NotSupported(
+      "JemallocNodumpAllocator is only available with jemalloc version >= 5 "
+      "and when MADV_DONTDUMP is available.");
+#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  (void)options;
+  return unsupported;
+#else
+  if (!HasJemalloc()) {
+    return unsupported;
+  }
+  if (options.limit_tcache_size &&
+      options.tcache_size_lower_bound >= options.tcache_size_upper_bound) {
+    return Status::InvalidArgument(
+        "tcache_size_lower_bound must be smaller than "
+        "tcache_size_upper_bound.");
+  }
+
+  // Create arena.
+  unsigned arena_index = 0;
+  size_t arena_index_size = sizeof(arena_index);
+  int ret =
+      mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to create jemalloc arena, error code: " +
+                              ToString(ret));
+  }
+  assert(arena_index != 0);
+
+  // Read existing hooks.
+  std::string key = "arena." + ToString(arena_index) + ".extent_hooks";
+  extent_hooks_t* hooks;
+  size_t hooks_size = sizeof(hooks);
+  ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Failed to read existing hooks, error code: " +
+                              ToString(ret));
+  }
+
+  // Store existing alloc.
+  extent_alloc_t* original_alloc = hooks->alloc;
+  extent_alloc_t* expected = nullptr;
+  bool success =
+      JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+          expected, original_alloc);
+  if (!success && original_alloc != expected) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Original alloc conflict.");
+  }
+
+  // Set the custom hook.
+  std::unique_ptr<extent_hooks_t> new_hooks(new extent_hooks_t(*hooks));
+  new_hooks->alloc = &JemallocNodumpAllocator::Alloc;
+  extent_hooks_t* hooks_ptr = new_hooks.get();
+  ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);
+    return Status::Incomplete("Failed to set custom hook, error code: " +
+                              ToString(ret));
+  }
+
+  // Create cache allocator.
+  memory_allocator->reset(
+      new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index));
+  return Status::OK();
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/jemalloc_nodump_allocator.h b/src/rocksdb/util/jemalloc_nodump_allocator.h
new file mode 100644
index 00000000..e93c1223
--- /dev/null
+++ b/src/rocksdb/util/jemalloc_nodump_allocator.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <vector>
+
+#include "port/jemalloc_helper.h"
+#include "port/port.h"
+#include "rocksdb/memory_allocator.h"
+#include "util/core_local.h"
+#include "util/thread_local.h"
+
+#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
+
+#include <sys/mman.h>
+
+#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)
+#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+namespace rocksdb {
+
+class JemallocNodumpAllocator : public MemoryAllocator {
+ public:
+  JemallocNodumpAllocator(JemallocAllocatorOptions& options,
+                          std::unique_ptr<extent_hooks_t>&& arena_hooks,
+                          unsigned arena_index);
+  ~JemallocNodumpAllocator();
+
+  const char* Name() const override { return "JemallocNodumpAllocator"; }
+  void* Allocate(size_t size) override;
+  void Deallocate(void* p) override;
+  size_t UsableSize(void* p, size_t allocation_size) const override;
+
+ private:
+  friend Status NewJemallocNodumpAllocator(
+      JemallocAllocatorOptions& options,
+      std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+  // Custom alloc hook to replace the jemalloc default alloc.
+  static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
+                     size_t alignment, bool* zero, bool* commit,
+                     unsigned arena_ind);
+
+  // Destroy arena on destruction of the allocator, or on failure.
+  static Status DestroyArena(unsigned arena_index);
+
+  // Destroy tcache on destruction of the allocator, or on thread exit.
+  static void DestroyThreadSpecificCache(void* ptr);
+
+  // Get or create tcache. Return a flag suitable for use with `mallocx`:
+  // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc).
+  int GetThreadSpecificCache(size_t size);
+
+  // A function pointer to the jemalloc default alloc. Use an atomic to make
+  // sure NewJemallocNodumpAllocator is thread-safe.
+  //
+  // Hack: original_alloc_ needs to be static for Alloc() to access it.
+  // alloc needs to be static to pass to jemalloc as a function pointer.
+  static std::atomic<extent_alloc_t*> original_alloc_;
+
+  const JemallocAllocatorOptions options_;
+
+  // Custom hooks have to outlive the corresponding arena.
+  const std::unique_ptr<extent_hooks_t> arena_hooks_;
+
+  // Arena index.
+  const unsigned arena_index_;
+
+  // Holds the thread-local tcache index.
+  ThreadLocalPtr tcache_;
+};
+
+}  // namespace rocksdb
+#endif  // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP
+#endif  // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
diff --git a/src/rocksdb/util/kv_map.h b/src/rocksdb/util/kv_map.h
new file mode 100644
index 00000000..d5ba3307
--- /dev/null
+++ b/src/rocksdb/util/kv_map.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
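A sketch of how the nodump allocator above is meant to be wired in, assuming the public rocksdb/cache.h API of this era (LRUCacheOptions gained a memory_allocator field alongside this allocator); the helper name and capacity value are invented for illustration. When jemalloc 5+ or MADV_DONTDUMP is unavailable, NewJemallocNodumpAllocator returns NotSupported and the cache simply falls back to the default allocator.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

std::shared_ptr<rocksdb::Cache> NewBlockCacheExcludedFromCoreDumps() {
  rocksdb::JemallocAllocatorOptions jopts;
  std::shared_ptr<rocksdb::MemoryAllocator> allocator;
  rocksdb::Status s = rocksdb::NewJemallocNodumpAllocator(jopts, &allocator);

  rocksdb::LRUCacheOptions cache_opts;
  cache_opts.capacity = static_cast<size_t>(1) << 30;  // 1 GiB, arbitrary
  if (s.ok()) {
    // Cache memory now comes from a dedicated jemalloc arena whose pages are
    // marked MADV_DONTDUMP, keeping block cache contents out of core dumps.
    cache_opts.memory_allocator = allocator;
  }
  return rocksdb::NewLRUCache(cache_opts);
}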
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+namespace stl_wrappers {
+
+struct LessOfComparator {
+  explicit LessOfComparator(const Comparator* c = BytewiseComparator())
+      : cmp(c) {}
+
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+  bool operator()(const Slice& a, const Slice& b) const {
+    return cmp->Compare(a, b) < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+typedef std::map<std::string, std::string, LessOfComparator> KVMap;
+}  // namespace stl_wrappers
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/log_buffer.cc b/src/rocksdb/util/log_buffer.cc
new file mode 100644
index 00000000..d09e0cb0
--- /dev/null
+++ b/src/rocksdb/util/log_buffer.cc
@@ -0,0 +1,93 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/log_buffer.h"
+
+#include "port/sys_time.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger* info_log)
+    : log_level_(log_level), info_log_(info_log) {}
+
+void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format,
+                               va_list ap) {
+  if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) {
+    // Skip the message if its level is below the logger's log level.
+    return;
+  }
+
+  char* alloc_mem = arena_.AllocateAligned(max_log_size);
+  BufferedLog* buffered_log = new (alloc_mem) BufferedLog();
+  char* p = buffered_log->message;
+  char* limit = alloc_mem + max_log_size - 1;
+
+  // Store the time
+  gettimeofday(&(buffered_log->now_tv), nullptr);
+
+  // Print the message
+  if (p < limit) {
+    va_list backup_ap;
+    va_copy(backup_ap, ap);
+    auto n = vsnprintf(p, limit - p, format, backup_ap);
+#ifndef OS_WIN
+    // MS reports -1 when the buffer is too short
+    assert(n >= 0);
+#endif
+    if (n > 0) {
+      p += n;
+    } else {
+      p = limit;
+    }
+    va_end(backup_ap);
+  }
+
+  if (p > limit) {
+    p = limit;
+  }
+
+  // Add '\0' to the end
+  *p = '\0';
+
+  logs_.push_back(buffered_log);
+}
+
+void LogBuffer::FlushBufferToLog() {
+  for (BufferedLog* log : logs_) {
+    const time_t seconds = log->now_tv.tv_sec;
+    struct tm t;
+    if (localtime_r(&seconds, &t) != nullptr) {
+      Log(log_level_, info_log_,
+          "(Original Log Time %04d/%02d/%02d-%02d:%02d:%02d.%06d) %s",
+          t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min,
+          t.tm_sec, static_cast<int>(log->now_tv.tv_usec), log->message);
+    }
+  }
+  logs_.clear();
+}
+
+void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format,
+                 ...) {
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(max_log_size, format, ap);
+    va_end(ap);
+  }
+}
+
+void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) {
+  const size_t kDefaultMaxLogSize = 512;
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(kDefaultMaxLogSize, format, ap);
+    va_end(ap);
+  }
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/log_buffer.h b/src/rocksdb/util/log_buffer.h
new file mode 100644
index 00000000..e356b93a
--- /dev/null
+++ b/src/rocksdb/util/log_buffer.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "util/arena.h"
+#include "util/autovector.h"
+#include "port/sys_time.h"
+#include <ctime>
+
+namespace rocksdb {
+
+class Logger;
+
+// A class to buffer info log entries and flush them in the end.
+class LogBuffer {
+ public:
+  // log_level: the log level for all the logs
+  // info_log: logger to write the logs to
+  LogBuffer(const InfoLogLevel log_level, Logger* info_log);
+
+  // Add a log entry to the buffer.
+  // max_log_size indicates the maximum log size, including some metadata.
+  void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap);
+
+  bool IsEmpty() const { return logs_.empty(); }
+
+  // Flush all buffered logs to the info log.
+  void FlushBufferToLog();
+
+ private:
+  // One log entry with its timestamp
+  struct BufferedLog {
+    struct timeval now_tv;  // Timestamp of the log
+    char message[1];        // Beginning of log message
+  };
+
+  const InfoLogLevel log_level_;
+  Logger* info_log_;
+  Arena arena_;
+  autovector<BufferedLog*> logs_;
+};
+
+// Add a log to the LogBuffer for delayed info logging. It can be used when
+// we want to add some logs while holding a mutex.
+// max_log_size indicates the maximum log size, including some metadata.
+extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size,
+                        const char* format, ...);
+// Same as the previous function, but with the default max log size.
+extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/log_write_bench.cc b/src/rocksdb/util/log_write_bench.cc
new file mode 100644
index 00000000..5c9b3e84
--- /dev/null
+++ b/src/rocksdb/util/log_write_bench.cc
@@ -0,0 +1,83 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
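The intended flow for LogBuffer above: format messages while holding a lock, then do the actual logger I/O after releasing it. A minimal sketch, assuming a hypothetical caller that already owns a port::Mutex and a Logger (the function name and message are invented):

#include "util/log_buffer.h"
#include "util/mutexlock.h"

void UpdateStateAndLog(rocksdb::port::Mutex* mu, rocksdb::Logger* info_log) {
  rocksdb::LogBuffer log_buffer(rocksdb::InfoLogLevel::INFO_LEVEL, info_log);
  {
    rocksdb::MutexLock l(mu);
    // ... mutate shared state under the mutex ...
    // Formatting goes into the arena-backed buffer; no I/O happens here.
    rocksdb::LogToBuffer(&log_buffer, "picked %d files for compaction", 4);
  }
  // The buffered entries are written out now, outside the critical section,
  // prefixed with the timestamps captured at LogToBuffer() time.
  log_buffer.FlushBufferToLog();
}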
+ +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include "monitoring/histogram.h" +#include "rocksdb/env.h" +#include "util/file_reader_writer.h" +#include "util/gflags_compat.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; + +// A simple benchmark to simulate transactional logs + +DEFINE_int32(num_records, 6000, "Number of records."); +DEFINE_int32(record_size, 249, "Size of each record."); +DEFINE_int32(record_interval, 10000, "Interval between records (microSec)"); +DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions"); +DEFINE_bool(enable_sync, false, "sync after each write."); + +namespace rocksdb { +void RunBenchmark() { + std::string file_name = test::PerThreadDBPath("log_write_benchmark.log"); + Env* env = Env::Default(); + EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions()); + env_options.bytes_per_sync = FLAGS_bytes_per_sync; + std::unique_ptr<WritableFile> file; + env->NewWritableFile(file_name, &file, env_options); + std::unique_ptr<WritableFileWriter> writer; + writer.reset(new WritableFileWriter(std::move(file), env_options)); + + std::string record; + record.assign(FLAGS_record_size, 'X'); + + HistogramImpl hist; + + uint64_t start_time = env->NowMicros(); + for (int i = 0; i < FLAGS_num_records; i++) { + uint64_t start_nanos = env->NowNanos(); + writer->Append(record); + writer->Flush(); + if (FLAGS_enable_sync) { + writer->Sync(false); + } + hist.Add(env->NowNanos() - start_nanos); + + if (i % 1000 == 1) { + fprintf(stderr, "Wrote %d records...\n", i); + } + + int time_to_sleep = + (i + 1) * FLAGS_record_interval - (env->NowMicros() - start_time); + if (time_to_sleep > 0) { + env->SleepForMicroseconds(time_to_sleep); + } + } + + fprintf(stderr, "Distribution of latency of append+flush: \n%s", + hist.ToString().c_str()); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + rocksdb::RunBenchmark(); + return 0; +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/logging.h b/src/rocksdb/util/logging.h new file mode 100644 index 00000000..a4ef31bd --- /dev/null +++ b/src/rocksdb/util/logging.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#pragma once + +// Helper macros that include information about file name and line number +#define ROCKS_LOG_STRINGIFY(x) #x +#define ROCKS_LOG_TOSTRING(x) ROCKS_LOG_STRINGIFY(x) +#define ROCKS_LOG_PREPEND_FILE_LINE(FMT) ("[%s:" ROCKS_LOG_TOSTRING(__LINE__) "] " FMT) + +inline const char* RocksLogShorterFileName(const char* file) +{ + // 15 is the length of "util/logging.h". + // If the name of this file changed, please change this number, too. + return file + (sizeof(__FILE__) > 15 ? 
sizeof(__FILE__) - 15 : 0);
+}
+
+// Don't include file/line info in HEADER level
+#define ROCKS_LOG_HEADER(LGR, FMT, ...) \
+  rocksdb::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_DEBUG(LGR, FMT, ...)                                        \
+  rocksdb::Log(InfoLogLevel::DEBUG_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_INFO(LGR, FMT, ...)                                         \
+  rocksdb::Log(InfoLogLevel::INFO_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_WARN(LGR, FMT, ...)                                         \
+  rocksdb::Log(InfoLogLevel::WARN_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_ERROR(LGR, FMT, ...)                                        \
+  rocksdb::Log(InfoLogLevel::ERROR_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_FATAL(LGR, FMT, ...)                                        \
+  rocksdb::Log(InfoLogLevel::FATAL_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+               RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...)                                   \
+  rocksdb::LogToBuffer(LOG_BUF, ROCKS_LOG_PREPEND_FILE_LINE(FMT),             \
+                       RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER_MAX_SZ(LOG_BUF, MAX_LOG_SIZE, FMT, ...)              \
+  rocksdb::LogToBuffer(LOG_BUF, MAX_LOG_SIZE, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+                       RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_DETAILS(LGR, FMT, ...) \
+  ;  // skipped by default due to the overhead of such lines
+// ROCKS_LOG_DEBUG(LGR, FMT, ##__VA_ARGS__)
diff --git a/src/rocksdb/util/memory_allocator.h b/src/rocksdb/util/memory_allocator.h
new file mode 100644
index 00000000..99a7241d
--- /dev/null
+++ b/src/rocksdb/util/memory_allocator.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include "rocksdb/memory_allocator.h"
+
+namespace rocksdb {
+
+struct CustomDeleter {
+  CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
+
+  void operator()(char* ptr) const {
+    if (allocator) {
+      allocator->Deallocate(reinterpret_cast<void*>(ptr));
+    } else {
+      delete[] ptr;
+    }
+  }
+
+  MemoryAllocator* allocator;
+};
+
+using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
+
+inline CacheAllocationPtr AllocateBlock(size_t size,
+                                        MemoryAllocator* allocator) {
+  if (allocator) {
+    auto block = reinterpret_cast<char*>(allocator->Allocate(size));
+    return CacheAllocationPtr(block, allocator);
+  }
+  return CacheAllocationPtr(new char[size]);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/memory_usage.h b/src/rocksdb/util/memory_usage.h
new file mode 100644
index 00000000..0d885445
--- /dev/null
+++ b/src/rocksdb/util/memory_usage.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <unordered_map>
+
+namespace rocksdb {
+
+// Helper methods to estimate memory usage by std containers.
+ +template <class Key, class Value, class Hash> +size_t ApproximateMemoryUsage( + const std::unordered_map<Key, Value, Hash>& umap) { + typedef std::unordered_map<Key, Value, Hash> Map; + return sizeof(umap) + + // Size of all items plus a next pointer for each item. + (sizeof(typename Map::value_type) + sizeof(void*)) * umap.size() + + // Size of hash buckets. + umap.bucket_count() * sizeof(void*); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/mock_time_env.h b/src/rocksdb/util/mock_time_env.h new file mode 100644 index 00000000..feada477 --- /dev/null +++ b/src/rocksdb/util/mock_time_env.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace rocksdb { + +class MockTimeEnv : public EnvWrapper { + public: + explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + + virtual Status GetCurrentTime(int64_t* time) override { + assert(time != nullptr); + assert(current_time_ <= + static_cast<uint64_t>(std::numeric_limits<int64_t>::max())); + *time = static_cast<int64_t>(current_time_); + return Status::OK(); + } + + virtual uint64_t NowMicros() override { + assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000); + return current_time_ * 1000000; + } + + virtual uint64_t NowNanos() override { + assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000000); + return current_time_ * 1000000000; + } + + uint64_t RealNowMicros() { return target()->NowMicros(); } + + void set_current_time(uint64_t time) { + assert(time >= current_time_); + current_time_ = time; + } + + private: + std::atomic<uint64_t> current_time_{0}; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/murmurhash.cc b/src/rocksdb/util/murmurhash.cc new file mode 100644 index 00000000..3b759c5e --- /dev/null +++ b/src/rocksdb/util/murmurhash.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash + is under the MIT license. +*/ +#include "murmurhash.h" +#include "util/util.h" + +#if defined(__x86_64__) + +// ------------------------------------------------------------------- +// +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. 
+// +// 64-bit hash for 64-bit platforms + +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; FALLTHROUGH_INTENDED; + case 6: h ^= ((uint64_t)data2[5]) << 40; FALLTHROUGH_INTENDED; + case 5: h ^= ((uint64_t)data2[4]) << 32; FALLTHROUGH_INTENDED; + case 4: h ^= ((uint64_t)data2[3]) << 24; FALLTHROUGH_INTENDED; + case 3: h ^= ((uint64_t)data2[2]) << 16; FALLTHROUGH_INTENDED; + case 2: h ^= ((uint64_t)data2[1]) << 8; FALLTHROUGH_INTENDED; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED; + case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#else + +// ------------------------------------------------------------------- +// +// Same as MurmurHash2, but endian- and alignment-neutral. +// Half the speed though, alas. 
+ +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h = seed ^ len; + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k; + + k = data[0]; + k |= data[1] << 8; + k |= data[2] << 16; + k |= data[3] << 24; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + switch(len) + { + case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED; + case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED; + case 1: h ^= data[0]; + h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#endif diff --git a/src/rocksdb/util/murmurhash.h b/src/rocksdb/util/murmurhash.h new file mode 100644 index 00000000..cbfc4068 --- /dev/null +++ b/src/rocksdb/util/murmurhash.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash + is under the MIT license. +*/ +#pragma once +#include <stdint.h> +#include "rocksdb/slice.h" + +#if defined(__x86_64__) +#define MURMUR_HASH MurmurHash64A +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash64A +typedef uint64_t murmur_t; + +#elif defined(__i386__) +#define MURMUR_HASH MurmurHash2 +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash2 +typedef unsigned int murmur_t; + +#else +#define MURMUR_HASH MurmurHashNeutral2 +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHashNeutral2 +typedef unsigned int murmur_t; +#endif + +// Allow slice to be hashable by murmur hash. +namespace rocksdb { +struct murmur_hash { + size_t operator()(const Slice& slice) const { + return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0); + } +}; +} // rocksdb diff --git a/src/rocksdb/util/mutexlock.h b/src/rocksdb/util/mutexlock.h new file mode 100644 index 00000000..640cef3d --- /dev/null +++ b/src/rocksdb/util/mutexlock.h @@ -0,0 +1,131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <assert.h> +#include <atomic> +#include <mutex> +#include <thread> +#include "port/port.h" + +namespace rocksdb { + +// Helper class that locks a mutex on construction and unlocks the mutex when +// the destructor of the MutexLock object is invoked. +// +// Typical usage: +// +// void MyClass::MyMethod() { +// MutexLock l(&mu_); // mu_ is an instance variable +// ... some complex code, possibly with multiple return paths ... 
+// }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
+    this->mu_->Lock();
+  }
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+  // No copying allowed
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+};
+
+//
+// Acquire a ReadLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->ReadLock();
+  }
+  ~ReadLock() { this->mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadLock(const ReadLock&);
+  void operator=(const ReadLock&);
+};
+
+//
+// Automatically unlock a locked mutex when the object is destroyed
+//
+class ReadUnlock {
+ public:
+  explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu->AssertHeld(); }
+  ~ReadUnlock() { mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  ReadUnlock(const ReadUnlock &) = delete;
+  ReadUnlock &operator=(const ReadUnlock &) = delete;
+};
+
+//
+// Acquire a WriteLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
+    this->mu_->WriteLock();
+  }
+  ~WriteLock() { this->mu_->WriteUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+  // No copying allowed
+  WriteLock(const WriteLock&);
+  void operator=(const WriteLock&);
+};
+
+//
+// SpinMutex has very low overhead for low-contention cases. Method names
+// are chosen so you can use std::unique_lock or std::lock_guard with it.
+//
+class SpinMutex {
+ public:
+  SpinMutex() : locked_(false) {}
+
+  bool try_lock() {
+    auto currently_locked = locked_.load(std::memory_order_relaxed);
+    return !currently_locked &&
+           locked_.compare_exchange_weak(currently_locked, true,
+                                         std::memory_order_acquire,
+                                         std::memory_order_relaxed);
+  }
+
+  void lock() {
+    for (size_t tries = 0;; ++tries) {
+      if (try_lock()) {
+        // success
+        break;
+      }
+      port::AsmVolatilePause();
+      if (tries > 100) {
+        std::this_thread::yield();
+      }
+    }
+  }
+
+  void unlock() { locked_.store(false, std::memory_order_release); }
+
+ private:
+  std::atomic<bool> locked_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/ppc-opcode.h b/src/rocksdb/util/ppc-opcode.h
new file mode 100644
index 00000000..554fa50a
--- /dev/null
+++ b/src/rocksdb/util/ppc-opcode.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2017 International Business Machines Corp.
+// All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// This source code is also licensed under the GPLv2 license found in the
+// COPYING file in the root directory of this source tree.
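Because SpinMutex's method names follow the standard C++ Lockable convention (lock/try_lock/unlock), the usual RAII wrappers from the standard library apply to it directly, just as the guard classes above do for port::Mutex and port::RWMutex. A small hypothetical example (the Counter class is invented for illustration):

#include <cstdint>
#include <mutex>

#include "util/mutexlock.h"

class Counter {
 public:
  void Increment() {
    // Unlocks automatically when the guard goes out of scope, on every
    // return path, mirroring the MutexLock idiom above.
    std::lock_guard<rocksdb::SpinMutex> guard(mu_);
    ++value_;
  }
  uint64_t Get() {
    std::lock_guard<rocksdb::SpinMutex> guard(mu_);
    return value_;
  }

 private:
  rocksdb::SpinMutex mu_;
  uint64_t value_ = 0;
};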
+ +#pragma once + +#define __PPC_RA(a) (((a)&0x1f) << 16) +#define __PPC_RB(b) (((b)&0x1f) << 11) +#define __PPC_XA(a) ((((a)&0x1f) << 16) | (((a)&0x20) >> 3)) +#define __PPC_XB(b) ((((b)&0x1f) << 11) | (((b)&0x20) >> 4)) +#define __PPC_XS(s) ((((s)&0x1f) << 21) | (((s)&0x20) >> 5)) +#define __PPC_XT(s) __PPC_XS(s) +#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b)) +#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b)) + +#define PPC_INST_VPMSUMW 0x10000488 +#define PPC_INST_VPMSUMD 0x100004c8 +#define PPC_INST_MFVSRD 0x7c000066 +#define PPC_INST_MTVSRD 0x7c000166 + +#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b) +#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b) +#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t) + 32, a, 0) +#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t) + 32, a, 0) diff --git a/src/rocksdb/util/random.cc b/src/rocksdb/util/random.cc new file mode 100644 index 00000000..5e2cf626 --- /dev/null +++ b/src/rocksdb/util/random.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "util/random.h" + +#include <stdint.h> +#include <string.h> +#include <thread> +#include <utility> + +#include "port/likely.h" +#include "util/thread_local.h" + +#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#define STORAGE_DECL static __thread +#else +#define STORAGE_DECL static +#endif + +namespace rocksdb { + +Random* Random::GetTLSInstance() { + STORAGE_DECL Random* tls_instance; + STORAGE_DECL std::aligned_storage<sizeof(Random)>::type tls_instance_bytes; + + auto rv = tls_instance; + if (UNLIKELY(rv == nullptr)) { + size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id()); + rv = new (&tls_instance_bytes) Random((uint32_t)seed); + tls_instance = rv; + } + return rv; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/random.h b/src/rocksdb/util/random.h new file mode 100644 index 00000000..2a5fcbc6 --- /dev/null +++ b/src/rocksdb/util/random.h @@ -0,0 +1,109 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <random> +#include <stdint.h> + +namespace rocksdb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + enum : uint32_t { + M = 2147483647L // 2^31-1 + }; + enum : uint64_t { + A = 16807 // bits 14, 8, 7, 5, 2, 1, 0 + }; + + uint32_t seed_; + + static uint32_t GoodSeed(uint32_t s) { return (s & M) != 0 ? 
(s & M) : 1; } + + public: + // This is the largest value that can be returned from Next() + enum : uint32_t { kMaxNext = M }; + + explicit Random(uint32_t s) : seed_(GoodSeed(s)) {} + + void Reset(uint32_t s) { seed_ = GoodSeed(s); } + + uint32_t Next() { + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast<uint32_t>((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } + + // Returns a Random instance for use by the current thread without + // additional locking + static Random* GetTLSInstance(); +}; + +// A simple 64bit random number generator based on std::mt19937_64 +class Random64 { + private: + std::mt19937_64 generator_; + + public: + explicit Random64(uint64_t s) : generator_(s) { } + + // Generates the next random number + uint64_t Next() { return generator_(); } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint64_t Uniform(uint64_t n) { + return std::uniform_int_distribution<uint64_t>(0, n - 1)(generator_); + } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(uint64_t n) { return Uniform(n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint64_t Skewed(int max_log) { + return Uniform(uint64_t(1) << Uniform(max_log + 1)); + } +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/rate_limiter.cc b/src/rocksdb/util/rate_limiter.cc new file mode 100644 index 00000000..9d23c38f --- /dev/null +++ b/src/rocksdb/util/rate_limiter.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
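The branchless reduction in Random::Next() above is worth spelling out: with M = 2^31 - 1 we have 2^31 ≡ 1 (mod M), so product % M equals (product >> 31) + (product & M), minus M at most once; the value M itself never occurs because seed_ stays in [1, M-1]. A standalone check of that identity (an illustrative program, not part of the diff):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t M = 2147483647ULL;  // 2^31 - 1
  const uint64_t A = 16807ULL;
  uint32_t seed = 1;
  for (int i = 0; i < 1000000; ++i) {
    uint64_t product = seed * A;  // fits in 64 bits: < 2^31 * 16807
    // The fast reduction used by Random::Next().
    uint32_t fast = static_cast<uint32_t>((product >> 31) + (product & M));
    if (fast > M) {
      fast -= M;  // the first reduction may overflow by one bit
    }
    assert(fast == product % M);  // matches the plain modulo
    seed = fast;
  }
  return 0;
}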
+
+#include "util/rate_limiter.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "util/aligned_buffer.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+size_t RateLimiter::RequestToken(size_t bytes, size_t alignment,
+                                 Env::IOPriority io_priority, Statistics* stats,
+                                 RateLimiter::OpType op_type) {
+  if (io_priority < Env::IO_TOTAL && IsRateLimited(op_type)) {
+    bytes = std::min(bytes, static_cast<size_t>(GetSingleBurstBytes()));
+
+    if (alignment > 0) {
+      // Here we may require more than the burst size and block, but we
+      // cannot write less than one page at a time on direct I/O, so callers
+      // may prefer not to use the rate limiter in that case.
+      bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
+    }
+    Request(bytes, io_priority, stats, op_type);
+  }
+  return bytes;
+}
+
+// Pending request
+struct GenericRateLimiter::Req {
+  explicit Req(int64_t _bytes, port::Mutex* _mu)
+      : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {}
+  int64_t request_bytes;
+  int64_t bytes;
+  port::CondVar cv;
+  bool granted;
+};
+
+GenericRateLimiter::GenericRateLimiter(int64_t rate_bytes_per_sec,
+                                       int64_t refill_period_us,
+                                       int32_t fairness, RateLimiter::Mode mode,
+                                       Env* env, bool auto_tuned)
+    : RateLimiter(mode),
+      refill_period_us_(refill_period_us),
+      rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2
+                                     : rate_bytes_per_sec),
+      refill_bytes_per_period_(
+          CalculateRefillBytesPerPeriod(rate_bytes_per_sec_)),
+      env_(env),
+      stop_(false),
+      exit_cv_(&request_mutex_),
+      requests_to_wait_(0),
+      available_bytes_(0),
+      next_refill_us_(NowMicrosMonotonic(env_)),
+      fairness_(fairness > 100 ? 100 : fairness),
+      rnd_((uint32_t)time(nullptr)),
+      leader_(nullptr),
+      auto_tuned_(auto_tuned),
+      num_drains_(0),
+      prev_num_drains_(0),
+      max_bytes_per_sec_(rate_bytes_per_sec),
+      tuned_time_(NowMicrosMonotonic(env_)) {
+  total_requests_[0] = 0;
+  total_requests_[1] = 0;
+  total_bytes_through_[0] = 0;
+  total_bytes_through_[1] = 0;
+}
+
+GenericRateLimiter::~GenericRateLimiter() {
+  MutexLock g(&request_mutex_);
+  stop_ = true;
+  requests_to_wait_ = static_cast<int32_t>(queue_[Env::IO_LOW].size() +
+                                           queue_[Env::IO_HIGH].size());
+  for (auto& r : queue_[Env::IO_HIGH]) {
+    r->cv.Signal();
+  }
+  for (auto& r : queue_[Env::IO_LOW]) {
+    r->cv.Signal();
+  }
+  while (requests_to_wait_ > 0) {
+    exit_cv_.Wait();
+  }
+}
+
+// This API allows the user to dynamically change the rate limiter's bytes
+// per second.
+void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) {
+  assert(bytes_per_second > 0);
+  rate_bytes_per_sec_ = bytes_per_second;
+  refill_bytes_per_period_.store(
+      CalculateRefillBytesPerPeriod(bytes_per_second),
+      std::memory_order_relaxed);
+}
+
+void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
+                                 Statistics* stats) {
+  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+  TEST_SYNC_POINT("GenericRateLimiter::Request");
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1",
+                           &rate_bytes_per_sec_);
+  MutexLock g(&request_mutex_);
+
+  if (auto_tuned_) {
+    static const int kRefillsPerTune = 100;
+    std::chrono::microseconds now(NowMicrosMonotonic(env_));
+    if (now - tuned_time_ >=
+        kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) {
+      Tune();
+    }
+  }
+
+  if (stop_) {
+    return;
+  }
+
+  ++total_requests_[pri];
+
+  if (available_bytes_ >= bytes) {
+    // The refill thread assigns quota and notifies requests waiting on
+    // the queue under the mutex. 
So if we get here, it means no
+    // request is waiting in the queue.
+    available_bytes_ -= bytes;
+    total_bytes_through_[pri] += bytes;
+    return;
+  }
+
+  // Request cannot be satisfied at this moment, enqueue
+  Req r(bytes, &request_mutex_);
+  queue_[pri].push_back(&r);
+
+  do {
+    bool timedout = false;
+    // Leader election, candidates can be:
+    // (1) a new incoming request,
+    // (2) a previous leader, whose quota has not been assigned yet due
+    //     to lower priority,
+    // (3) a previous waiter at the front of the queue, who got notified by
+    //     the previous leader
+    if (leader_ == nullptr &&
+        ((!queue_[Env::IO_HIGH].empty() &&
+          &r == queue_[Env::IO_HIGH].front()) ||
+         (!queue_[Env::IO_LOW].empty() &&
+          &r == queue_[Env::IO_LOW].front()))) {
+      leader_ = &r;
+      int64_t delta = next_refill_us_ - NowMicrosMonotonic(env_);
+      delta = delta > 0 ? delta : 0;
+      if (delta == 0) {
+        timedout = true;
+      } else {
+        int64_t wait_until = env_->NowMicros() + delta;
+        RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS);
+        ++num_drains_;
+        timedout = r.cv.TimedWait(wait_until);
+      }
+    } else {
+      // Not at the front of the queue, or a leader has already been elected
+      r.cv.Wait();
+    }
+
+    // request_mutex_ is held from now on
+    if (stop_) {
+      --requests_to_wait_;
+      exit_cv_.Signal();
+      return;
+    }
+
+    // Make sure the woken-up request is always at the head of its queue
+    assert(r.granted ||
+           (!queue_[Env::IO_HIGH].empty() &&
+            &r == queue_[Env::IO_HIGH].front()) ||
+           (!queue_[Env::IO_LOW].empty() &&
+            &r == queue_[Env::IO_LOW].front()));
+    assert(leader_ == nullptr ||
+           (!queue_[Env::IO_HIGH].empty() &&
+            leader_ == queue_[Env::IO_HIGH].front()) ||
+           (!queue_[Env::IO_LOW].empty() &&
+            leader_ == queue_[Env::IO_LOW].front()));
+
+    if (leader_ == &r) {
+      // Woken up from TimedWait()
+      if (timedout) {
+        // Time to do refill!
+        Refill();
+
+        // Re-elect a new leader regardless. This is to simplify the
+        // election handling.
+        leader_ = nullptr;
+
+        // Notify the head of the queue if the current leader is going away
+        if (r.granted) {
+          // The current leader was already granted its quota. Notify the
+          // head of the waiting queue so it participates in the next round
+          // of election.
+          assert((queue_[Env::IO_HIGH].empty() ||
+                  &r != queue_[Env::IO_HIGH].front()) &&
+                 (queue_[Env::IO_LOW].empty() ||
+                  &r != queue_[Env::IO_LOW].front()));
+          if (!queue_[Env::IO_HIGH].empty()) {
+            queue_[Env::IO_HIGH].front()->cv.Signal();
+          } else if (!queue_[Env::IO_LOW].empty()) {
+            queue_[Env::IO_LOW].front()->cv.Signal();
+          }
+          // Done
+          break;
+        }
+      } else {
+        // Spurious wake-up, need to continue to wait
+        assert(!r.granted);
+        leader_ = nullptr;
+      }
+    } else {
+      // Woken up by the previous leader:
+      // (1) if the requested quota was granted, it is done.
+      // (2) if the requested quota was not granted, this means the current
+      // thread was picked as a new leader candidate (the previous leader got
+      // its quota). It needs to participate in leader election because a new
+      // request may come in before this thread is woken up. So it may
+      // actually need to do Wait() again.
+      assert(!timedout);
+    }
+  } while (!r.granted);
+}
+
+void GenericRateLimiter::Refill() {
+  TEST_SYNC_POINT("GenericRateLimiter::Refill");
+  next_refill_us_ = NowMicrosMonotonic(env_) + refill_period_us_;
+  // Carry over the leftover quota from the last period
+  auto refill_bytes_per_period =
+      refill_bytes_per_period_.load(std::memory_order_relaxed);
+  if (available_bytes_ < refill_bytes_per_period) {
+    available_bytes_ += refill_bytes_per_period;
+  }
+
+  int use_low_pri_first = rnd_.OneIn(fairness_) ? 
0 : 1; + for (int q = 0; q < 2; ++q) { + auto use_pri = (use_low_pri_first == q) ? Env::IO_LOW : Env::IO_HIGH; + auto* queue = &queue_[use_pri]; + while (!queue->empty()) { + auto* next_req = queue->front(); + if (available_bytes_ < next_req->request_bytes) { + // avoid starvation + next_req->request_bytes -= available_bytes_; + available_bytes_ = 0; + break; + } + available_bytes_ -= next_req->request_bytes; + next_req->request_bytes = 0; + total_bytes_through_[use_pri] += next_req->bytes; + queue->pop_front(); + + next_req->granted = true; + if (next_req != leader_) { + // Quota granted, signal the thread + next_req->cv.Signal(); + } + } + } +} + +int64_t GenericRateLimiter::CalculateRefillBytesPerPeriod( + int64_t rate_bytes_per_sec) { + if (port::kMaxInt64 / rate_bytes_per_sec < refill_period_us_) { + // Avoid unexpected result in the overflow case. The result now is still + // inaccurate but is a number that is large enough. + return port::kMaxInt64 / 1000000; + } else { + return std::max(kMinRefillBytesPerPeriod, + rate_bytes_per_sec * refill_period_us_ / 1000000); + } +} + +Status GenericRateLimiter::Tune() { + const int kLowWatermarkPct = 50; + const int kHighWatermarkPct = 90; + const int kAdjustFactorPct = 5; + // computed rate limit will be in + // `[max_bytes_per_sec_ / kAllowedRangeFactor, max_bytes_per_sec_]`. + const int kAllowedRangeFactor = 20; + + std::chrono::microseconds prev_tuned_time = tuned_time_; + tuned_time_ = std::chrono::microseconds(NowMicrosMonotonic(env_)); + + int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time + + std::chrono::microseconds(refill_period_us_) - + std::chrono::microseconds(1)) / + std::chrono::microseconds(refill_period_us_); + // We tune every kRefillsPerTune intervals, so the overflow and division-by- + // zero conditions should never happen. 
+ assert(num_drains_ - prev_num_drains_ <= port::kMaxInt64 / 100); + assert(elapsed_intervals > 0); + int64_t drained_pct = + (num_drains_ - prev_num_drains_) * 100 / elapsed_intervals; + + int64_t prev_bytes_per_sec = GetBytesPerSecond(); + int64_t new_bytes_per_sec; + if (drained_pct == 0) { + new_bytes_per_sec = max_bytes_per_sec_ / kAllowedRangeFactor; + } else if (drained_pct < kLowWatermarkPct) { + // sanitize to prevent overflow + int64_t sanitized_prev_bytes_per_sec = + std::min(prev_bytes_per_sec, port::kMaxInt64 / 100); + new_bytes_per_sec = + std::max(max_bytes_per_sec_ / kAllowedRangeFactor, + sanitized_prev_bytes_per_sec * 100 / (100 + kAdjustFactorPct)); + } else if (drained_pct > kHighWatermarkPct) { + // sanitize to prevent overflow + int64_t sanitized_prev_bytes_per_sec = std::min( + prev_bytes_per_sec, port::kMaxInt64 / (100 + kAdjustFactorPct)); + new_bytes_per_sec = + std::min(max_bytes_per_sec_, + sanitized_prev_bytes_per_sec * (100 + kAdjustFactorPct) / 100); + } else { + new_bytes_per_sec = prev_bytes_per_sec; + } + if (new_bytes_per_sec != prev_bytes_per_sec) { + SetBytesPerSecond(new_bytes_per_sec); + } + num_drains_ = prev_num_drains_; + return Status::OK(); +} + +RateLimiter* NewGenericRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */, + int32_t fairness /* = 10 */, + RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */, + bool auto_tuned /* = false */) { + assert(rate_bytes_per_sec > 0); + assert(refill_period_us > 0); + assert(fairness > 0); + return new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness, + mode, Env::Default(), auto_tuned); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/rate_limiter.h b/src/rocksdb/util/rate_limiter.h new file mode 100644 index 00000000..cb91f0ae --- /dev/null +++ b/src/rocksdb/util/rate_limiter.h @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <algorithm> +#include <atomic> +#include <chrono> +#include <deque> +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/rate_limiter.h" +#include "util/mutexlock.h" +#include "util/random.h" + +namespace rocksdb { + +class GenericRateLimiter : public RateLimiter { + public: + GenericRateLimiter(int64_t refill_bytes, int64_t refill_period_us, + int32_t fairness, RateLimiter::Mode mode, Env* env, + bool auto_tuned); + + virtual ~GenericRateLimiter(); + + // This API allows user to dynamically change rate limiter's bytes per second. + virtual void SetBytesPerSecond(int64_t bytes_per_second) override; + + // Request for token to write bytes. If this request can not be satisfied, + // the call is blocked. 
The caller is responsible for
+  // ensuring bytes <= GetSingleBurstBytes().
+  using RateLimiter::Request;
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* stats) override;
+
+  virtual int64_t GetSingleBurstBytes() const override {
+    return refill_bytes_per_period_.load(std::memory_order_relaxed);
+  }
+
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_bytes_through_[Env::IO_LOW] +
+             total_bytes_through_[Env::IO_HIGH];
+    }
+    return total_bytes_through_[pri];
+  }
+
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      return total_requests_[Env::IO_LOW] + total_requests_[Env::IO_HIGH];
+    }
+    return total_requests_[pri];
+  }
+
+  virtual int64_t GetBytesPerSecond() const override {
+    return rate_bytes_per_sec_;
+  }
+
+ private:
+  void Refill();
+  int64_t CalculateRefillBytesPerPeriod(int64_t rate_bytes_per_sec);
+  Status Tune();
+
+  uint64_t NowMicrosMonotonic(Env* env) {
+    return env->NowNanos() / std::milli::den;
+  }
+
+  // This mutex guards all internal state
+  mutable port::Mutex request_mutex_;
+
+  const int64_t kMinRefillBytesPerPeriod = 100;
+
+  const int64_t refill_period_us_;
+
+  int64_t rate_bytes_per_sec_;
+  // This variable can be changed dynamically.
+  std::atomic<int64_t> refill_bytes_per_period_;
+  Env* const env_;
+
+  bool stop_;
+  port::CondVar exit_cv_;
+  int32_t requests_to_wait_;
+
+  int64_t total_requests_[Env::IO_TOTAL];
+  int64_t total_bytes_through_[Env::IO_TOTAL];
+  int64_t available_bytes_;
+  int64_t next_refill_us_;
+
+  int32_t fairness_;
+  Random rnd_;
+
+  struct Req;
+  Req* leader_;
+  std::deque<Req*> queue_[Env::IO_TOTAL];
+
+  bool auto_tuned_;
+  int64_t num_drains_;
+  int64_t prev_num_drains_;
+  const int64_t max_bytes_per_sec_;
+  std::chrono::microseconds tuned_time_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/rate_limiter_test.cc b/src/rocksdb/util/rate_limiter_test.cc
new file mode 100644
index 00000000..d3f3be3b
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter_test.cc
@@ -0,0 +1,239 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "util/rate_limiter.h"
+
+#include <inttypes.h>
+#include <chrono>
+#include <limits>
+
+#include "db/db_test_util.h"
+#include "rocksdb/env.h"
+#include "util/random.h"
+#include "util/sync_point.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// TODO(yhchiang): the rate will not be accurate when we run tests in parallel.
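A sketch of the client-side contract stated in rate_limiter.h above: since a single Request() must not exceed GetSingleBurstBytes(), large writes are fed to the limiter in chunks. The helper below is hypothetical and only illustrates the pattern; inside RocksDB this chunking is handled by the file writer via RateLimiter::RequestToken.

#include <algorithm>
#include <cstddef>

#include "rocksdb/env.h"
#include "rocksdb/rate_limiter.h"

void RateLimitedWrite(rocksdb::RateLimiter* limiter, const char* data,
                      size_t size) {
  size_t written = 0;
  while (written < size) {
    size_t allowed = static_cast<size_t>(limiter->GetSingleBurstBytes());
    size_t chunk = std::min(size - written, allowed);
    // Blocks until `chunk` bytes of quota have been granted at this priority.
    limiter->Request(static_cast<int64_t>(chunk), rocksdb::Env::IO_LOW,
                     nullptr /* stats */,
                     rocksdb::RateLimiter::OpType::kWrite);
    // ... actually write data[written, written + chunk) here ...
    written += chunk;
  }
}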
+class RateLimiterTest : public testing::Test {}; + +TEST_F(RateLimiterTest, OverflowRate) { + GenericRateLimiter limiter(port::kMaxInt64, 1000, 10, + RateLimiter::Mode::kWritesOnly, Env::Default(), + false /* auto_tuned */); + ASSERT_GT(limiter.GetSingleBurstBytes(), 1000000000ll); +} + +TEST_F(RateLimiterTest, StartStop) { + std::unique_ptr<RateLimiter> limiter(NewGenericRateLimiter(100, 100, 10)); +} + +TEST_F(RateLimiterTest, Modes) { + for (auto mode : {RateLimiter::Mode::kWritesOnly, + RateLimiter::Mode::kReadsOnly, RateLimiter::Mode::kAllIo}) { + GenericRateLimiter limiter( + 2000 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, mode, Env::Default(), false /* auto_tuned */); + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kRead); + if (mode == RateLimiter::Mode::kWritesOnly) { + ASSERT_EQ(0, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + if (mode == RateLimiter::Mode::kAllIo) { + ASSERT_EQ(2000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + } +} + +#if !(defined(TRAVIS) && defined(OS_MACOSX)) +TEST_F(RateLimiterTest, Rate) { + auto* env = Env::Default(); + struct Arg { + Arg(int32_t _target_rate, int _burst) + : limiter(NewGenericRateLimiter(_target_rate, 100 * 1000, 10)), + request_size(_target_rate / 10), + burst(_burst) {} + std::unique_ptr<RateLimiter> limiter; + int32_t request_size; + int burst; + }; + + auto writer = [](void* p) { + auto* thread_env = Env::Default(); + auto* arg = static_cast<Arg*>(p); + // Test for 2 seconds + auto until = thread_env->NowMicros() + 2 * 1000000; + Random r((uint32_t)(thread_env->NowNanos() % + std::numeric_limits<uint32_t>::max())); + while (thread_env->NowMicros() < until) { + for (int i = 0; i < static_cast<int>(r.Skewed(arg->burst) + 1); ++i) { + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, + Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW, + nullptr /* stats */, RateLimiter::OpType::kWrite); + } + }; + + for (int i = 1; i <= 16; i *= 2) { + int32_t target = i * 1024 * 10; + Arg arg(target, i / 4 + 1); + int64_t old_total_bytes_through = 0; + for (int iter = 1; iter <= 2; ++iter) { + // second iteration changes the target dynamically + if (iter == 2) { + target *= 2; + arg.limiter->SetBytesPerSecond(target); + } + auto start = env->NowMicros(); + for (int t = 0; t < i; ++t) { + env->StartThread(writer, &arg); + } + env->WaitForJoin(); + + auto elapsed = env->NowMicros() - start; + double rate = + (arg.limiter->GetTotalBytesThrough() - old_total_bytes_through) * + 1000000.0 / elapsed; + old_total_bytes_through = arg.limiter->GetTotalBytesThrough(); + fprintf(stderr, + "request size [1 - %" PRIi32 "], limit %" PRIi32 + " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n", + arg.request_size - 1, target / 1024, rate / 1024, + elapsed / 1000000.0); + + ASSERT_GE(rate / target, 0.80); + ASSERT_LE(rate / target, 1.25); + } + } +} +#endif + +TEST_F(RateLimiterTest, LimitChangeTest) { + // starvation test when limit changes to a smaller value + int64_t refill_period = 1000 * 1000; + auto* env = Env::Default(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + struct Arg { + Arg(int32_t 
_request_size, Env::IOPriority _pri, + std::shared_ptr<RateLimiter> _limiter) + : request_size(_request_size), pri(_pri), limiter(_limiter) {} + int32_t request_size; + Env::IOPriority pri; + std::shared_ptr<RateLimiter> limiter; + }; + + auto writer = [](void* p) { + auto* arg = static_cast<Arg*>(p); + arg->limiter->Request(arg->request_size, arg->pri, nullptr /* stats */, + RateLimiter::OpType::kWrite); + }; + + for (uint32_t i = 1; i <= 16; i <<= 1) { + int32_t target = i * 1024 * 10; + // refill per second + for (int iter = 0; iter < 2; iter++) { + std::shared_ptr<RateLimiter> limiter = + std::make_shared<GenericRateLimiter>( + target, refill_period, 10, RateLimiter::Mode::kWritesOnly, + Env::Default(), false /* auto_tuned */); + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"GenericRateLimiter::Request", + "RateLimiterTest::LimitChangeTest:changeLimitStart"}, + {"RateLimiterTest::LimitChangeTest:changeLimitEnd", + "GenericRateLimiter::Refill"}}); + Arg arg(target, Env::IO_HIGH, limiter); + // The idea behind is to start a request first, then before it refills, + // update limit to a different value (2X/0.5X). No starvation should + // be guaranteed under any situation + // TODO(lightmark): more test cases are welcome. + env->StartThread(writer, &arg); + int32_t new_limit = (target << 1) >> (iter << 1); + TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitStart"); + arg.limiter->SetBytesPerSecond(new_limit); + TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitEnd"); + env->WaitForJoin(); + fprintf(stderr, + "[COMPLETE] request size %" PRIi32 " KB, new limit %" PRIi32 + "KB/sec, refill period %" PRIi64 " ms\n", + target / 1024, new_limit / 1024, refill_period / 1000); + } + } +} + +TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { + const std::chrono::seconds kTimePerRefill(1); + const int kRefillsPerTune = 100; // needs to match util/rate_limiter.cc + + SpecialEnv special_env(Env::Default()); + special_env.no_slowdown_ = true; + special_env.time_elapse_only_sleep_ = true; + + auto stats = CreateDBStatistics(); + std::unique_ptr<RateLimiter> rate_limiter(new GenericRateLimiter( + 1000 /* rate_bytes_per_sec */, + std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */, + RateLimiter::Mode::kWritesOnly, &special_env, true /* auto_tuned */)); + + // Use callback to advance time because we need to advance (1) after Request() + // has determined the bytes are not available; and (2) before Refill() + // computes the next refill time (ensuring refill time in the future allows + // the next request to drain the rate limiter). 
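+  // Aside: the auto-tuner exercised by this test (GenericRateLimiter::Tune(),
+  // shown earlier in rate_limiter.cc) rescales the limit multiplicatively.
+  // Illustrative arithmetic, assuming a hypothetical kAdjustFactorPct of 5
+  // (the real constant lives in rate_limiter.cc):
+  //   drained_pct > kHighWatermarkPct:
+  //     new_rate = min(max_bytes_per_sec_,
+  //                    prev_rate * (100 + kAdjustFactorPct) / 100)
+  //     e.g. 1000 B/s becomes 1050 B/s
+  //   drained_pct < kLowWatermarkPct:
+  //     new_rate = max(max_bytes_per_sec_ / kAllowedRangeFactor,
+  //                    prev_rate * 100 / (100 + kAdjustFactorPct))
+  //     e.g. 1050 B/s drops back to 1000 B/s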
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Refill", [&](void* /*arg*/) {
+        special_env.SleepForMicroseconds(static_cast<int>(
+            std::chrono::microseconds(kTimePerRefill).count()));
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // verify rate limit increases after a sequence of periods where rate limiter
+  // is always drained
+  int64_t orig_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+                        RateLimiter::OpType::kWrite);
+  while (std::chrono::microseconds(special_env.NowMicros()) <=
+         kRefillsPerTune * kTimePerRefill) {
+    rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+                          RateLimiter::OpType::kWrite);
+  }
+  int64_t new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  ASSERT_GT(new_bytes_per_sec, orig_bytes_per_sec);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  // decreases after a sequence of periods where rate limiter is not drained
+  orig_bytes_per_sec = new_bytes_per_sec;
+  special_env.SleepForMicroseconds(static_cast<int>(
+      kRefillsPerTune * std::chrono::microseconds(kTimePerRefill).count()));
+  // make a request so tuner can be triggered
+  rate_limiter->Request(1 /* bytes */, Env::IO_HIGH, stats.get(),
+                        RateLimiter::OpType::kWrite);
+  new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  ASSERT_LT(new_bytes_per_sec, orig_bytes_per_sec);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/repeatable_thread.h b/src/rocksdb/util/repeatable_thread.h
new file mode 100644
index 00000000..967cc499
--- /dev/null
+++ b/src/rocksdb/util/repeatable_thread.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "util/mock_time_env.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+class RepeatableThread {
+ public:
+  RepeatableThread(std::function<void()> function,
+                   const std::string& thread_name, Env* env, uint64_t delay_us,
+                   uint64_t initial_delay_us = 0)
+      : function_(function),
+        thread_name_("rocksdb:" + thread_name),
+        env_(env),
+        delay_us_(delay_us),
+        initial_delay_us_(initial_delay_us),
+        mutex_(env),
+        cond_var_(&mutex_),
+        running_(true),
+#ifndef NDEBUG
+        waiting_(false),
+        run_count_(0),
+#endif
+        thread_([this] { thread(); }) {
+  }
+
+  void cancel() {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (!running_) {
+        return;
+      }
+      running_ = false;
+      cond_var_.SignalAll();
+    }
+    thread_.join();
+  }
+
+  bool IsRunning() { return running_; }
+
+  ~RepeatableThread() { cancel(); }
+
+#ifndef NDEBUG
+  // Wait until RepeatableThread starts waiting, call the optional callback,
+  // then wait for one run of RepeatableThread. Tests can provide a
+  // custom env object to mock time, and use the callback here to bump current
+  // time and trigger RepeatableThread. See repeatable_thread_test for an
+  // example.
+  //
+  // Note: only supports one caller of this method.
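+  // A minimal sketch of driving this hook under a mocked clock (illustrative
+  // only; `runs` is an assumed local counter, and the pattern mirrors
+  // repeatable_thread_test.cc further below):
+  //
+  //   rocksdb::MockTimeEnv mock_env(rocksdb::Env::Default());
+  //   mock_env.set_current_time(0);
+  //   std::atomic<int> runs{0};
+  //   rocksdb::RepeatableThread thread([&] { runs++; }, "demo", &mock_env,
+  //                                    1000000 /* delay_us */,
+  //                                    1000000 /* initial_delay_us */);
+  //   thread.TEST_WaitForRun([&] { mock_env.set_current_time(1); });
+  //   thread.cancel();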
+ void TEST_WaitForRun(std::function<void()> callback = nullptr) { + InstrumentedMutexLock l(&mutex_); + while (!waiting_) { + cond_var_.Wait(); + } + uint64_t prev_count = run_count_; + if (callback != nullptr) { + callback(); + } + cond_var_.SignalAll(); + while (!(run_count_ > prev_count)) { + cond_var_.Wait(); + } + } +#endif + + private: + bool wait(uint64_t delay) { + InstrumentedMutexLock l(&mutex_); + if (running_ && delay > 0) { + uint64_t wait_until = env_->NowMicros() + delay; +#ifndef NDEBUG + waiting_ = true; + cond_var_.SignalAll(); +#endif + while (running_) { + cond_var_.TimedWait(wait_until); + if (env_->NowMicros() >= wait_until) { + break; + } + } +#ifndef NDEBUG + waiting_ = false; +#endif + } + return running_; + } + + void thread() { +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + // Set thread name. + auto thread_handle = thread_.native_handle(); + int ret __attribute__((__unused__)) = + pthread_setname_np(thread_handle, thread_name_.c_str()); + assert(ret == 0); +#endif +#endif + + assert(delay_us_ > 0); + if (!wait(initial_delay_us_)) { + return; + } + do { + function_(); +#ifndef NDEBUG + { + InstrumentedMutexLock l(&mutex_); + run_count_++; + cond_var_.SignalAll(); + } +#endif + } while (wait(delay_us_)); + } + + const std::function<void()> function_; + const std::string thread_name_; + Env* const env_; + const uint64_t delay_us_; + const uint64_t initial_delay_us_; + + // Mutex lock should be held when accessing running_, waiting_ + // and run_count_. + InstrumentedMutex mutex_; + InstrumentedCondVar cond_var_; + bool running_; +#ifndef NDEBUG + // RepeatableThread waiting for timeout. + bool waiting_; + // Times function_ had run. + uint64_t run_count_; +#endif + port::Thread thread_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/repeatable_thread_test.cc b/src/rocksdb/util/repeatable_thread_test.cc new file mode 100644 index 00000000..ee853c10 --- /dev/null +++ b/src/rocksdb/util/repeatable_thread_test.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <atomic> +#include <memory> + +#include "db/db_test_util.h" +#include "util/repeatable_thread.h" +#include "util/sync_point.h" +#include "util/testharness.h" + +class RepeatableThreadTest : public testing::Test { + public: + RepeatableThreadTest() + : mock_env_(new rocksdb::MockTimeEnv(rocksdb::Env::Default())) {} + + protected: + std::unique_ptr<rocksdb::MockTimeEnv> mock_env_; +}; + +TEST_F(RepeatableThreadTest, TimedTest) { + constexpr uint64_t kSecond = 1000000; // 1s = 1000000us + constexpr int kIteration = 3; + rocksdb::Env* env = rocksdb::Env::Default(); + rocksdb::port::Mutex mutex; + rocksdb::port::CondVar test_cv(&mutex); + int count = 0; + uint64_t prev_time = env->NowMicros(); + rocksdb::RepeatableThread thread( + [&] { + rocksdb::MutexLock l(&mutex); + count++; + uint64_t now = env->NowMicros(); + assert(count == 1 || prev_time + 1 * kSecond <= now); + prev_time = now; + if (count >= kIteration) { + test_cv.SignalAll(); + } + }, + "rt_test", env, 1 * kSecond); + // Wait for execution finish. 
+  {
+    rocksdb::MutexLock l(&mutex);
+    while (count < kIteration) {
+      test_cv.Wait();
+    }
+  }
+
+  // Test cancel
+  thread.cancel();
+}
+
+TEST_F(RepeatableThreadTest, MockEnvTest) {
+  constexpr uint64_t kSecond = 1000000;  // 1s = 1000000us
+  constexpr int kIteration = 3;
+  mock_env_->set_current_time(0);  // in seconds
+  std::atomic<int> count{0};
+
+#if defined(OS_MACOSX) && !defined(NDEBUG)
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+        // Obtain the current (real) time in seconds and add 1000 extra seconds
+        // to ensure that RepeatableThread::wait invokes TimedWait with a time
+        // greater than (real) current time. This is to prevent the TimedWait
+        // function from returning immediately without sleeping and releasing
+        // the mutex on certain platforms, e.g. OS X. If TimedWait returns
+        // immediately, the mutex will not be released, and
+        // RepeatableThread::TEST_WaitForRun never has a chance to execute the
+        // callback which, in this case, updates the result returned by
+        // mock_env->NowMicros. Consequently, RepeatableThread::wait cannot
+        // break out of the loop, causing the test to hang. The extra 1000
+        // seconds is a best-effort approach because there seems to be no
+        // reliable and deterministic way to provide the aforementioned
+        // guarantee. By the time RepeatableThread::wait is called, there is
+        // no guarantee that the delay + mock_env->NowMicros will be greater
+        // than the current real time. However, 1000 seconds should be
+        // sufficient in most cases.
+        uint64_t time_us = *reinterpret_cast<uint64_t*>(arg);
+        if (time_us < mock_env_->RealNowMicros()) {
+          *reinterpret_cast<uint64_t*>(arg) = mock_env_->RealNowMicros() + 1000;
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+#endif  // OS_MACOSX && !NDEBUG
+
+  rocksdb::RepeatableThread thread([&] { count++; }, "rt_test", mock_env_.get(),
+                                   1 * kSecond, 1 * kSecond);
+  for (int i = 1; i <= kIteration; i++) {
+    // Bump current time
+    thread.TEST_WaitForRun([&] { mock_env_->set_current_time(i); });
+  }
+  // The test function should be executed exactly kIteration times.
+  ASSERT_EQ(kIteration, count.load());
+
+  // Test cancel
+  thread.cancel();
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/set_comparator.h b/src/rocksdb/util/set_comparator.h
new file mode 100644
index 00000000..4ecd0040
--- /dev/null
+++ b/src/rocksdb/util/set_comparator.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace rocksdb {
+// A comparator to be used in std::set
+struct SetComparator {
+  explicit SetComparator() : user_comparator_(BytewiseComparator()) {}
+  explicit SetComparator(const Comparator* user_comparator)
+      : user_comparator_(user_comparator ? user_comparator
+                                         : BytewiseComparator()) {}
+  bool operator()(const Slice& lhs, const Slice& rhs) const {
+    return user_comparator_->Compare(lhs, rhs) < 0;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+};
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/slice.cc b/src/rocksdb/util/slice.cc
new file mode 100644
index 00000000..5e23ae0a
--- /dev/null
+++ b/src/rocksdb/util/slice.cc
@@ -0,0 +1,212 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/slice.h"
+#include "util/string_util.h"
+#include <stdio.h>
+
+namespace rocksdb {
+
+namespace {
+
+class FixedPrefixTransform : public SliceTransform {
+ private:
+  size_t prefix_len_;
+  std::string name_;
+
+ public:
+  explicit FixedPrefixTransform(size_t prefix_len)
+      : prefix_len_(prefix_len),
+        // Note that if any part of the name format changes, it will require
+        // changes on options_helper in order to make RocksDBOptionsParser work
+        // for the new change.
+        // TODO(yhchiang): move serialization / deserialization code inside
+        // the class implementation itself.
+        name_("rocksdb.FixedPrefix." + ToString(prefix_len_)) {}
+
+  const char* Name() const override { return name_.c_str(); }
+
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    return Slice(src.data(), prefix_len_);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    return (src.size() >= prefix_len_);
+  }
+
+  bool InRange(const Slice& dst) const override {
+    return (dst.size() == prefix_len_);
+  }
+
+  bool FullLengthEnabled(size_t* len) const override {
+    *len = prefix_len_;
+    return true;
+  }
+
+  bool SameResultWhenAppended(const Slice& prefix) const override {
+    return InDomain(prefix);
+  }
+};
+
+class CappedPrefixTransform : public SliceTransform {
+ private:
+  size_t cap_len_;
+  std::string name_;
+
+ public:
+  explicit CappedPrefixTransform(size_t cap_len)
+      : cap_len_(cap_len),
+        // Note that if any part of the name format changes, it will require
+        // changes on options_helper in order to make RocksDBOptionsParser work
+        // for the new change.
+        // TODO(yhchiang): move serialization / deserialization code inside
+        // the class implementation itself.
+        name_("rocksdb.CappedPrefix."
+ ToString(cap_len_)) {} + + const char* Name() const override { return name_.c_str(); } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return Slice(src.data(), std::min(cap_len_, src.size())); + } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& dst) const override { + return (dst.size() <= cap_len_); + } + + bool FullLengthEnabled(size_t* len) const override { + *len = cap_len_; + return true; + } + + bool SameResultWhenAppended(const Slice& prefix) const override { + return prefix.size() >= cap_len_; + } +}; + +class NoopTransform : public SliceTransform { + public: + explicit NoopTransform() { } + + const char* Name() const override { return "rocksdb.Noop"; } + + Slice Transform(const Slice& src) const override { return src; } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& /*dst*/) const override { return true; } + + bool SameResultWhenAppended(const Slice& /*prefix*/) const override { + return false; + } +}; + +} + +// 2 small internal utility functions, for efficient hex conversions +// and no need for snprintf, toupper etc... +// Originally from wdt/util/EncryptionUtils.cpp - for ToString(true)/DecodeHex: +char toHex(unsigned char v) { + if (v <= 9) { + return '0' + v; + } + return 'A' + v - 10; +} +// most of the code is for validation/error check +int fromHex(char c) { + // toupper: + if (c >= 'a' && c <= 'f') { + c -= ('a' - 'A'); // aka 0x20 + } + // validation + if (c < '0' || (c > '9' && (c < 'A' || c > 'F'))) { + return -1; // invalid not 0-9A-F hex char + } + if (c <= '9') { + return c - '0'; + } + return c - 'A' + 10; +} + +Slice::Slice(const SliceParts& parts, std::string* buf) { + size_t length = 0; + for (int i = 0; i < parts.num_parts; ++i) { + length += parts.parts[i].size(); + } + buf->reserve(length); + + for (int i = 0; i < parts.num_parts; ++i) { + buf->append(parts.parts[i].data(), parts.parts[i].size()); + } + data_ = buf->data(); + size_ = buf->size(); +} + +// Return a string that contains the copy of the referenced data. 
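+// For instance, a slice holding the two bytes {0x0f, 0xa0} yields "0FA0"
+// from ToString(true), using the uppercase digits produced by toHex() above
+// (illustrative values, not from the original sources).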
+std::string Slice::ToString(bool hex) const { + std::string result; // RVO/NRVO/move + if (hex) { + result.reserve(2 * size_); + for (size_t i = 0; i < size_; ++i) { + unsigned char c = data_[i]; + result.push_back(toHex(c >> 4)); + result.push_back(toHex(c & 0xf)); + } + return result; + } else { + result.assign(data_, size_); + return result; + } +} + +// Originally from rocksdb/utilities/ldb_cmd.h +bool Slice::DecodeHex(std::string* result) const { + std::string::size_type len = size_; + if (len % 2) { + // Hex string must be even number of hex digits to get complete bytes back + return false; + } + if (!result) { + return false; + } + result->clear(); + result->reserve(len / 2); + + for (size_t i = 0; i < len;) { + int h1 = fromHex(data_[i++]); + if (h1 < 0) { + return false; + } + int h2 = fromHex(data_[i++]); + if (h2 < 0) { + return false; + } + result->push_back(static_cast<char>((h1 << 4) | h2)); + } + return true; +} + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { + return new CappedPrefixTransform(cap_len); +} + +const SliceTransform* NewNoopTransform() { + return new NoopTransform; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/slice_transform_test.cc b/src/rocksdb/util/slice_transform_test.cc new file mode 100644 index 00000000..f91675cc --- /dev/null +++ b/src/rocksdb/util/slice_transform_test.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
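+// ToString(true) and DecodeHex() above form an encode/decode round trip. A
+// minimal sketch (illustrative only; assumes "rocksdb/slice.h" and <cassert>
+// are available):
+//
+//   std::string hex = rocksdb::Slice("\x12\x34", 2).ToString(true);
+//   assert(hex == "1234");
+//   std::string bytes;
+//   assert(rocksdb::Slice(hex).DecodeHex(&bytes));
+//   assert(bytes == std::string("\x12\x34", 2));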
+ +#include "rocksdb/slice_transform.h" + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "util/testharness.h" + +namespace rocksdb { + +class SliceTransformTest : public testing::Test {}; + +TEST_F(SliceTransformTest, CapPrefixTransform) { + std::string s; + s = "abcdefge"; + + std::unique_ptr<const SliceTransform> transform; + + transform.reset(NewCappedPrefixTransform(6)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdef"); + ASSERT_TRUE(transform->SameResultWhenAppended("123456")); + ASSERT_TRUE(transform->SameResultWhenAppended("1234567")); + ASSERT_TRUE(!transform->SameResultWhenAppended("12345")); + + transform.reset(NewCappedPrefixTransform(8)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(10)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform(s).ToString(), ""); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform("").ToString(), ""); +} + +class SliceTransformDBTest : public testing::Test { + private: + std::string dbname_; + Env* env_; + DB* db_; + + public: + SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) { + dbname_ = test::PerThreadDBPath("slice_transform_db_test"); + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + ~SliceTransformDBTest() override { + delete db_; + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + DB* db() { return db_; } + + // Return the current option configuration. + Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } + + Options last_options_; +}; + +namespace { +uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} +} // namespace + +TEST_F(SliceTransformDBTest, CapPrefix) { + last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8)); + last_options_.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_OK(TryReopen()); + + ReadOptions ro; + FlushOptions fo; + WriteOptions wo; + + ASSERT_OK(db()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(db()->Put(wo, "foo", "bar")); + ASSERT_OK(db()->Put(wo, "foo3", "bar3")); + ASSERT_OK(db()->Flush(fo)); + + std::unique_ptr<Iterator> iter(db()->NewIterator(ro)); + + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "bar"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U); + + iter->Seek("foo2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barbarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "foo"); + ASSERT_EQ(TestGetTickerCount(last_options_, 
BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barfoofoo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U); + + iter->Seek("foobarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/sst_file_manager_impl.cc b/src/rocksdb/util/sst_file_manager_impl.cc new file mode 100644 index 00000000..6a770b10 --- /dev/null +++ b/src/rocksdb/util/sst_file_manager_impl.cc @@ -0,0 +1,527 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/sst_file_manager_impl.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> +#include <vector> + +#include "db/db_impl.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/sst_file_manager.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger, + int64_t rate_bytes_per_sec, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) + : env_(env), + logger_(logger), + total_files_size_(0), + in_progress_files_size_(0), + compaction_buffer_size_(0), + cur_compactions_reserved_size_(0), + max_allowed_space_(0), + delete_scheduler_(env, rate_bytes_per_sec, logger.get(), this, + max_trash_db_ratio, bytes_max_delete_chunk), + cv_(&mu_), + closing_(false), + bg_thread_(nullptr), + reserved_disk_buffer_(0), + free_space_trigger_(0), + cur_instance_(nullptr) { +} + +SstFileManagerImpl::~SstFileManagerImpl() { + Close(); +} + +void SstFileManagerImpl::Close() { + { + MutexLock l(&mu_); + if (closing_) { + return; + } + closing_ = true; + cv_.SignalAll(); + } + if (bg_thread_) { + bg_thread_->join(); + } +} + +Status SstFileManagerImpl::OnAddFile(const std::string& file_path, + bool compaction) { + uint64_t file_size; + Status s = env_->GetFileSize(file_path, &file_size); + if (s.ok()) { + MutexLock l(&mu_); + OnAddFileImpl(file_path, file_size, compaction); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + return s; +} + +Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) { + { + MutexLock l(&mu_); + OnDeleteFileImpl(file_path); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile"); + return Status::OK(); +} + +void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) { + MutexLock l(&mu_); + uint64_t size_added_by_compaction = 0; + for (size_t i = 0; i < c->num_input_levels(); i++) { + for (size_t j = 0; j < c->num_input_files(i); j++) { + FileMetaData* filemeta = c->input(i, j); + size_added_by_compaction += filemeta->fd.GetFileSize(); + } + } + cur_compactions_reserved_size_ -= size_added_by_compaction; + + auto new_files = c->edit()->GetNewFiles(); + for (auto& new_file : new_files) { + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, + new_file.second.fd.GetNumber(), + new_file.second.fd.GetPathId()); + if (in_progress_files_.find(fn) != in_progress_files_.end()) { + auto tracked_file = tracked_files_.find(fn); + assert(tracked_file != 
tracked_files_.end());
+      in_progress_files_size_ -= tracked_file->second;
+      in_progress_files_.erase(fn);
+    }
+  }
+}
+
+Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
+                                      const std::string& new_path,
+                                      uint64_t* file_size) {
+  {
+    MutexLock l(&mu_);
+    if (file_size != nullptr) {
+      *file_size = tracked_files_[old_path];
+    }
+    OnAddFileImpl(new_path, tracked_files_[old_path], false);
+    OnDeleteFileImpl(old_path);
+  }
+  TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
+  return Status::OK();
+}
+
+void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
+  MutexLock l(&mu_);
+  max_allowed_space_ = max_allowed_space;
+}
+
+void SstFileManagerImpl::SetCompactionBufferSize(
+    uint64_t compaction_buffer_size) {
+  MutexLock l(&mu_);
+  compaction_buffer_size_ = compaction_buffer_size;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ >= max_allowed_space_;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ + cur_compactions_reserved_size_ >=
+         max_allowed_space_;
+}
+
+bool SstFileManagerImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    Status bg_error) {
+  MutexLock l(&mu_);
+  uint64_t size_added_by_compaction = 0;
+  // First check if we even have the space to do the compaction
+  for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t j = 0; j < inputs[i].size(); j++) {
+      FileMetaData* filemeta = inputs[i][j];
+      size_added_by_compaction += filemeta->fd.GetFileSize();
+    }
+  }
+
+  // Update cur_compactions_reserved_size_ so concurrent compactions
+  // don't max out space
+  size_t needed_headroom =
+      cur_compactions_reserved_size_ + size_added_by_compaction +
+      compaction_buffer_size_;
+  if (max_allowed_space_ != 0 &&
+      (needed_headroom + total_files_size_ > max_allowed_space_)) {
+    return false;
+  }
+
+  // Implement more aggressive checks only if this DB instance has already
+  // seen a NoSpace() error. This is in order to contain a single potentially
+  // misbehaving DB instance and prevent it from slowing down compactions of
+  // other DB instances
+  if (CheckFreeSpace() && bg_error == Status::NoSpace()) {
+    auto fn =
+        TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
+                      inputs[0][0]->fd.GetPathId());
+    uint64_t free_space = 0;
+    env_->GetFreeSpace(fn, &free_space);
+    // needed_headroom is based on current size reserved by compactions,
+    // minus any files created by running compactions as they would count
+    // against the reserved size. If the user didn't specify any compaction
+    // buffer, add reserved_disk_buffer_ that's calculated by default so the
+    // compaction doesn't end up leaving nothing for logs and flush SSTs
+    if (compaction_buffer_size_ == 0) {
+      needed_headroom += reserved_disk_buffer_;
+    }
+    needed_headroom -= in_progress_files_size_;
+    if (free_space < needed_headroom + size_added_by_compaction) {
+      // We hit the condition of not enough disk space
+      ROCKS_LOG_ERROR(logger_,
+                      "free space [%" PRIu64
+                      " bytes] is less than "
+                      "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
+                      free_space, needed_headroom);
+      return false;
+    }
+  }
+
+  cur_compactions_reserved_size_ += size_added_by_compaction;
+  // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
+  // a NoSpace error.
+  free_space_trigger_ = cur_compactions_reserved_size_;
+  return true;
+}
+
+uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
+  MutexLock l(&mu_);
+  return cur_compactions_reserved_size_;
+}
+
+uint64_t SstFileManagerImpl::GetTotalSize() {
+  MutexLock l(&mu_);
+  return total_files_size_;
+}
+
+std::unordered_map<std::string, uint64_t>
+SstFileManagerImpl::GetTrackedFiles() {
+  MutexLock l(&mu_);
+  return tracked_files_;
+}
+
+int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
+  return delete_scheduler_.GetRateBytesPerSecond();
+}
+
+void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
+  return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
+}
+
+double SstFileManagerImpl::GetMaxTrashDBRatio() {
+  return delete_scheduler_.GetMaxTrashDBRatio();
+}
+
+void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
+  return delete_scheduler_.SetMaxTrashDBRatio(r);
+}
+
+uint64_t SstFileManagerImpl::GetTotalTrashSize() {
+  return delete_scheduler_.GetTotalTrashSize();
+}
+
+void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
+                                           const std::string& path) {
+  MutexLock l(&mu_);
+
+  reserved_disk_buffer_ += size;
+  if (path_.empty()) {
+    path_ = path;
+  }
+}
+
+void SstFileManagerImpl::ClearError() {
+  while (true) {
+    MutexLock l(&mu_);
+
+    if (closing_) {
+      return;
+    }
+
+    uint64_t free_space;
+    Status s = env_->GetFreeSpace(path_, &free_space);
+    if (s.ok()) {
+      // In case of multi-DB instances, some of them may have experienced a
+      // soft error and some a hard error. In the SstFileManagerImpl, a hard
+      // error will basically override previously reported soft errors. Once
+      // we clear the hard error, we don't keep track of previous errors for
+      // now
+      if (bg_err_.severity() == Status::Severity::kHardError) {
+        if (free_space < reserved_disk_buffer_) {
+          ROCKS_LOG_ERROR(logger_,
+                          "free space [%" PRIu64
+                          " bytes] is less than "
+                          "required disk buffer [%" PRIu64 " bytes]\n",
+                          free_space, reserved_disk_buffer_);
+          ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
+          s = Status::NoSpace();
+        }
+      } else if (bg_err_.severity() == Status::Severity::kSoftError) {
+        if (free_space < free_space_trigger_) {
+          ROCKS_LOG_WARN(logger_,
+                         "free space [%" PRIu64
+                         " bytes] is less than "
+                         "free space for compaction trigger [%" PRIu64
+                         " bytes]\n",
+                         free_space, free_space_trigger_);
+          ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
+          s = Status::NoSpace();
+        }
+      }
+    }
+
+    // Someone could have called CancelErrorRecovery() and the list could have
+    // become empty, so check again here
+    if (s.ok() && !error_handler_list_.empty()) {
+      auto error_handler = error_handler_list_.front();
+      // Since we will release the mutex, set cur_instance_ to signal to the
+      // shutdown thread, if it calls CancelErrorRecovery() in the meantime,
+      // to indicate that this DB instance is busy. The DB instance is
+      // guaranteed to not be deleted before RecoverFromBGError() returns,
+      // since the ErrorHandler::recovery_in_prog_ flag would be true
+      cur_instance_ = error_handler;
+      mu_.Unlock();
+      s = error_handler->RecoverFromBGError();
+      mu_.Lock();
+      // The DB instance might have been deleted while we were
+      // waiting for the mutex, so check cur_instance_ to make sure it's
+      // still non-null
+      if (cur_instance_) {
+        // Check for error again, since the instance may have recovered but
+        // immediately got another error.
If that's the case, and the new
+        // error is also a NoSpace() non-fatal error, leave the instance in
+        // the list
+        Status err = cur_instance_->GetBGError();
+        if (s.ok() && err == Status::NoSpace() &&
+            err.severity() < Status::Severity::kFatalError) {
+          s = err;
+        }
+        cur_instance_ = nullptr;
+      }
+
+      if (s.ok() || s.IsShutdownInProgress() ||
+          (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
+        // If shutdown is in progress, abandon this handler instance
+        // and continue with the others
+        error_handler_list_.pop_front();
+      }
+    }
+
+    if (!error_handler_list_.empty()) {
+      // If there are more instances to be recovered, reschedule after 5
+      // seconds
+      int64_t wait_until = env_->NowMicros() + 5000000;
+      cv_.TimedWait(wait_until);
+    }
+
+    // Check again for error_handler_list_ empty, as a DB instance shutdown
+    // could have removed it from the queue while we were in timed wait
+    if (error_handler_list_.empty()) {
+      ROCKS_LOG_INFO(logger_, "Clearing error\n");
+      bg_err_ = Status::OK();
+      return;
+    }
+  }
+}
+
+void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
+                                            Status bg_error) {
+  MutexLock l(&mu_);
+  if (bg_error.severity() == Status::Severity::kSoftError) {
+    if (bg_err_.ok()) {
+      // Setting bg_err_ basically means we're in degraded mode
+      // Assume that all pending compactions will fail similarly. The trigger
+      // for clearing this condition is set to current compaction reserved
+      // size, so we stop checking disk space available in
+      // EnoughRoomForCompaction once this much free space is available
+      bg_err_ = bg_error;
+    }
+  } else if (bg_error.severity() == Status::Severity::kHardError) {
+    bg_err_ = bg_error;
+  } else {
+    assert(false);
+  }
+
+  // If this is the first instance of this error, kick off a thread to poll
+  // and recover from this condition
+  if (error_handler_list_.empty()) {
+    error_handler_list_.push_back(handler);
+    // Release lock before calling join. It's ok to do so because
+    // error_handler_list_ is now non-empty, so no other invocation of this
+    // function will execute this piece of code
+    mu_.Unlock();
+    if (bg_thread_) {
+      bg_thread_->join();
+    }
+    // Start a new thread. The previous one would have exited.
+ bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this)); + mu_.Lock(); + } else { + // Check if this DB instance is already in the list + for (auto iter = error_handler_list_.begin(); + iter != error_handler_list_.end(); ++iter) { + if ((*iter) == handler) { + return; + } + } + error_handler_list_.push_back(handler); + } +} + +bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) { + MutexLock l(&mu_); + + if (cur_instance_ == handler) { + // This instance is currently busy attempting to recover + // Nullify it so the recovery thread doesn't attempt to access it again + cur_instance_ = nullptr; + return false; + } + + for (auto iter = error_handler_list_.begin(); + iter != error_handler_list_.end(); ++iter) { + if ((*iter) == handler) { + error_handler_list_.erase(iter); + return true; + } + } + return false; +} + +Status SstFileManagerImpl::ScheduleFileDeletion( + const std::string& file_path, const std::string& path_to_sync, + const bool force_bg) { + TEST_SYNC_POINT("SstFileManagerImpl::ScheduleFileDeletion"); + return delete_scheduler_.DeleteFile(file_path, path_to_sync, + force_bg); +} + +void SstFileManagerImpl::WaitForEmptyTrash() { + delete_scheduler_.WaitForEmptyTrash(); +} + +void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, + uint64_t file_size, bool compaction) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file != tracked_files_.end()) { + // File was added before, we will just update the size + assert(!compaction); + total_files_size_ -= tracked_file->second; + total_files_size_ += file_size; + cur_compactions_reserved_size_ -= file_size; + } else { + total_files_size_ += file_size; + if (compaction) { + // Keep track of the size of files created by in-progress compactions. + // When calculating whether there's enough headroom for new compactions, + // this will be subtracted from cur_compactions_reserved_size_. + // Otherwise, compactions will be double counted. + in_progress_files_size_ += file_size; + in_progress_files_.insert(file_path); + } + } + tracked_files_[file_path] = file_size; +} + +void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file == tracked_files_.end()) { + // File is not tracked + assert(in_progress_files_.find(file_path) == in_progress_files_.end()); + return; + } + + total_files_size_ -= tracked_file->second; + // Check if it belonged to an in-progress compaction + if (in_progress_files_.find(file_path) != in_progress_files_.end()) { + in_progress_files_size_ -= tracked_file->second; + in_progress_files_.erase(file_path); + } + tracked_files_.erase(tracked_file); +} + +SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log, + std::string trash_dir, + int64_t rate_bytes_per_sec, + bool delete_existing_trash, Status* status, + double max_trash_db_ratio, + uint64_t bytes_max_delete_chunk) { + SstFileManagerImpl* res = + new SstFileManagerImpl(env, info_log, rate_bytes_per_sec, + max_trash_db_ratio, bytes_max_delete_chunk); + + // trash_dir is deprecated and not needed anymore, but if user passed it + // we will still remove files in it. + Status s; + if (delete_existing_trash && trash_dir != "") { + std::vector<std::string> files_in_trash; + s = env->GetChildren(trash_dir, &files_in_trash); + if (s.ok()) { + for (const std::string& trash_file : files_in_trash) { + if (trash_file == "." 
|| trash_file == "..") {
+          continue;
+        }
+
+        std::string path_in_trash = trash_dir + "/" + trash_file;
+        res->OnAddFile(path_in_trash);
+        Status file_delete =
+            res->ScheduleFileDeletion(path_in_trash, trash_dir);
+        if (s.ok() && !file_delete.ok()) {
+          s = file_delete;
+        }
+      }
+    }
+  }
+
+  if (status) {
+    *status = s;
+  }
+
+  return res;
+}
+
+#else
+
+SstFileManager* NewSstFileManager(Env* /*env*/,
+                                  std::shared_ptr<Logger> /*info_log*/,
+                                  std::string /*trash_dir*/,
+                                  int64_t /*rate_bytes_per_sec*/,
+                                  bool /*delete_existing_trash*/,
+                                  Status* status, double /*max_trash_db_ratio*/,
+                                  uint64_t /*bytes_max_delete_chunk*/) {
+  if (status) {
+    *status =
+        Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
+  }
+  return nullptr;
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/sst_file_manager_impl.h b/src/rocksdb/util/sst_file_manager_impl.h
new file mode 100644
index 00000000..211b4fa7
--- /dev/null
+++ b/src/rocksdb/util/sst_file_manager_impl.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "port/port.h"
+
+#include "db/compaction.h"
+#include "db/error_handler.h"
+#include "rocksdb/sst_file_manager.h"
+#include "util/delete_scheduler.h"
+
+namespace rocksdb {
+
+class Env;
+class Logger;
+
+// SstFileManager is used to track SST files in the DB and control their
+// deletion rate.
+// All SstFileManager public functions are thread-safe.
+class SstFileManagerImpl : public SstFileManager {
+ public:
+  explicit SstFileManagerImpl(Env* env, std::shared_ptr<Logger> logger,
+                              int64_t rate_bytes_per_sec,
+                              double max_trash_db_ratio,
+                              uint64_t bytes_max_delete_chunk);
+
+  ~SstFileManagerImpl();
+
+  // DB will call OnAddFile whenever a new sst file is added.
+  Status OnAddFile(const std::string& file_path, bool compaction = false);
+
+  // DB will call OnDeleteFile whenever an sst file is deleted.
+  Status OnDeleteFile(const std::string& file_path);
+
+  // DB will call OnMoveFile whenever an sst file is moved to a new path.
+  Status OnMoveFile(const std::string& old_path, const std::string& new_path,
+                    uint64_t* file_size = nullptr);
+
+  // Update the maximum allowed space that should be used by RocksDB. If
+  // the total size of the SST files exceeds max_allowed_space, writes to
+  // RocksDB will fail.
+  //
+  // Setting max_allowed_space to 0 will disable this feature; maximum allowed
+  // space will be infinite (Default value).
+  //
+  // thread-safe.
+  void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override;
+
+  void SetCompactionBufferSize(uint64_t compaction_buffer_size) override;
+
+  // Return true if the total size of SST files exceeded the maximum allowed
+  // space usage.
+  //
+  // thread-safe.
+  bool IsMaxAllowedSpaceReached() override;
+
+  bool IsMaxAllowedSpaceReachedIncludingCompactions() override;
+
+  // Returns true if there is enough (approximate) space for the specified
+  // compaction. Space is approximate because this function conservatively
+  // estimates how much space is currently being used by compactions (i.e.
+  // if a compaction has started, this function bumps the used space by
+  // the full compaction size).
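+  // Worked example with illustrative numbers: given max_allowed_space_ =
+  // 100 GB, total_files_size_ = 80 GB, cur_compactions_reserved_size_ =
+  // 10 GB and compaction_buffer_size_ = 5 GB, a compaction whose inputs sum
+  // to 6 GB needs 10 + 6 + 5 = 21 GB of headroom, and 21 + 80 > 100, so the
+  // compaction is rejected.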
+  bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+                               const std::vector<CompactionInputFiles>& inputs,
+                               Status bg_error);
+
+  // Bookkeeping so total_file_sizes_ goes back to normal after compaction
+  // finishes
+  void OnCompactionCompletion(Compaction* c);
+
+  uint64_t GetCompactionsReservedSize();
+
+  // Return the total size of all tracked files.
+  uint64_t GetTotalSize() override;
+
+  // Return a map containing all tracked files and their corresponding sizes.
+  std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
+
+  // Return delete rate limit in bytes per second.
+  virtual int64_t GetDeleteRateBytesPerSecond() override;
+
+  // Update the delete rate limit in bytes per second.
+  virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override;
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  virtual double GetMaxTrashDBRatio() override;
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  virtual void SetMaxTrashDBRatio(double ratio) override;
+
+  // Return the total size of trash files
+  uint64_t GetTotalTrashSize() override;
+
+  // Called by each DB instance using this sst file manager to reserve
+  // disk buffer space for recovery from out of space errors
+  void ReserveDiskBuffer(uint64_t buffer, const std::string& path);
+
+  // Set a flag upon encountering disk full. May enqueue the ErrorHandler
+  // instance for background polling and recovery
+  void StartErrorRecovery(ErrorHandler* db, Status bg_error);
+
+  // Remove the given ErrorHandler instance from the recovery queue. Removal
+  // is not guaranteed (it fails if the instance is currently recovering)
+  bool CancelErrorRecovery(ErrorHandler* db);
+
+  // Mark a file as trash and schedule its deletion. If force_bg is set, it
+  // forces the file to be deleted in the background regardless of DB size,
+  // except when rate-limited delete is disabled
+  virtual Status ScheduleFileDeletion(const std::string& file_path,
+                                      const std::string& dir_to_sync,
+                                      const bool force_bg = false);
+
+  // Wait for all files being deleted in the background to finish or for
+  // the destructor to be called.
+  virtual void WaitForEmptyTrash();
+
+  DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
+
+  // Stop the error recovery background thread. This should be called only
+  // once in the object's lifetime, and before the destructor
+  void Close();
+
+ private:
+  // REQUIRES: mutex locked
+  void OnAddFileImpl(const std::string& file_path, uint64_t file_size,
+                     bool compaction);
+  // REQUIRES: mutex locked
+  void OnDeleteFileImpl(const std::string& file_path);
+
+  void ClearError();
+  bool CheckFreeSpace() {
+    return bg_err_.severity() == Status::Severity::kSoftError;
+  }
+
+  Env* env_;
+  std::shared_ptr<Logger> logger_;
+  // Mutex to protect tracked_files_, total_files_size_
+  port::Mutex mu_;
+  // The summation of the sizes of all files in tracked_files_ map
+  uint64_t total_files_size_;
+  // The summation of all output files of in-progress compactions
+  uint64_t in_progress_files_size_;
+  // Compactions should only execute if they can leave at least
+  // this amount of buffer space for logs and flushes
+  uint64_t compaction_buffer_size_;
+  // Estimated size of the current ongoing compactions
+  uint64_t cur_compactions_reserved_size_;
+  // A map containing all tracked files and their sizes
+  //  file_path => file_size
+  std::unordered_map<std::string, uint64_t> tracked_files_;
+  // A set of files belonging to in-progress compactions
+  std::unordered_set<std::string> in_progress_files_;
+  // The maximum allowed space (in bytes) for sst files.
+  uint64_t max_allowed_space_;
+  // DeleteScheduler used to throttle file deletion.
+  DeleteScheduler delete_scheduler_;
+  port::CondVar cv_;
+  // Flag to force error recovery thread to exit
+  bool closing_;
+  // Background error recovery thread
+  std::unique_ptr<port::Thread> bg_thread_;
+  // A path in the filesystem corresponding to this SFM. This is used for
+  // calling Env::GetFreeSpace. Posix requires a path in the filesystem
+  std::string path_;
+  // Save the current background error
+  Status bg_err_;
+  // Amount of free disk headroom before allowing recovery from hard errors
+  uint64_t reserved_disk_buffer_;
+  // For soft errors, amount of free disk space before we can allow
+  // compactions to run full throttle. If disk space is below this trigger,
+  // compactions will be gated by free disk space > input size
+  uint64_t free_space_trigger_;
+  // List of database error handler instances tracked by this sst file manager
+  std::list<ErrorHandler*> error_handler_list_;
+  // Pointer to ErrorHandler instance that is currently processing recovery
+  ErrorHandler* cur_instance_;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/status.cc b/src/rocksdb/util/status.cc
new file mode 100644
index 00000000..c66bf6f8
--- /dev/null
+++ b/src/rocksdb/util/status.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
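+// As a quick illustration of the state layout built below (example values,
+// not from the original file): Status::IOError("write failed", "/tmp/f")
+// stores "write failed: /tmp/f", so ToString() returns
+// "IO error: write failed: /tmp/f".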
+ +#include "rocksdb/status.h" +#include <stdio.h> +#ifdef OS_WIN +#include <string.h> +#endif +#include <cstring> +#include "port/port.h" + +namespace rocksdb { + +const char* Status::CopyState(const char* state) { +#ifdef OS_WIN + const size_t cch = std::strlen(state) + 1; // +1 for the null terminator + char* result = new char[cch]; + errno_t ret; + ret = strncpy_s(result, cch, state, cch - 1); + result[cch - 1] = '\0'; + assert(ret == 0); + return result; +#else + const size_t cch = std::strlen(state) + 1; // +1 for the null terminator + return std::strncpy(new char[cch], state, cch); +#endif +} + +static const char* msgs[static_cast<int>(Status::kMaxSubCode)] = { + "", // kNone + "Timeout Acquiring Mutex", // kMutexTimeout + "Timeout waiting to lock key", // kLockTimeout + "Failed to acquire lock due to max_num_locks limit", // kLockLimit + "No space left on device", // kNoSpace + "Deadlock", // kDeadlock + "Stale file handle", // kStaleFile + "Memory limit reached", // kMemoryLimit + "Space limit reached", // kSpaceLimit + "No such file or directory", // kPathNotFound +}; + +Status::Status(Code _code, SubCode _subcode, const Slice& msg, + const Slice& msg2) + : code_(_code), subcode_(_subcode), sev_(kNoError) { + assert(code_ != kOk); + assert(subcode_ != kMaxSubCode); + const size_t len1 = msg.size(); + const size_t len2 = msg2.size(); + const size_t size = len1 + (len2 ? (2 + len2) : 0); + char* const result = new char[size + 1]; // +1 for null terminator + memcpy(result, msg.data(), len1); + if (len2) { + result[len1] = ':'; + result[len1 + 1] = ' '; + memcpy(result + len1 + 2, msg2.data(), len2); + } + result[size] = '\0'; // null terminator for C style string + state_ = result; +} + +std::string Status::ToString() const { + char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge in progress: "; + break; + case kIncomplete: + type = "Result incomplete: "; + break; + case kShutdownInProgress: + type = "Shutdown in progress: "; + break; + case kTimedOut: + type = "Operation timed out: "; + break; + case kAborted: + type = "Operation aborted: "; + break; + case kBusy: + type = "Resource busy: "; + break; + case kExpired: + type = "Operation expired: "; + break; + case kTryAgain: + type = "Operation failed. Try again.: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast<int>(code())); + type = tmp; + break; + } + std::string result(type); + if (subcode_ != kNone) { + uint32_t index = static_cast<int32_t>(subcode_); + assert(sizeof(msgs) > index); + result.append(msgs[index]); + } + + if (state_ != nullptr) { + result.append(state_); + } + return result; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/stderr_logger.h b/src/rocksdb/util/stderr_logger.h new file mode 100644 index 00000000..8612fce0 --- /dev/null +++ b/src/rocksdb/util/stderr_logger.h @@ -0,0 +1,31 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
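+// A typical way to wire the logger defined below into a DB (illustrative
+// sketch; assumes the standard rocksdb::Options API):
+//
+//   rocksdb::Options options;
+//   options.info_log = std::make_shared<rocksdb::StderrLogger>(
+//       rocksdb::InfoLogLevel::DEBUG_LEVEL);
+//
+// Info-log output then goes to stderr rather than the LOG file.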
+ +#pragma once + +#include <stdarg.h> +#include <stdio.h> + +#include "rocksdb/env.h" + +namespace rocksdb { + +// Prints logs to stderr for faster debugging +class StderrLogger : public Logger { + public: + explicit StderrLogger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : Logger(log_level) {} + + // Brings overloaded Logv()s into scope so they're not hidden when we override + // a subset of them. + using Logger::Logv; + + virtual void Logv(const char* format, va_list ap) override { + vfprintf(stderr, format, ap); + fprintf(stderr, "\n"); + } +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/stop_watch.h b/src/rocksdb/util/stop_watch.h new file mode 100644 index 00000000..afa708e3 --- /dev/null +++ b/src/rocksdb/util/stop_watch.h @@ -0,0 +1,118 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "monitoring/statistics.h" +#include "rocksdb/env.h" + +namespace rocksdb { +// Auto-scoped. +// Records the measure time into the corresponding histogram if statistics +// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr +// and overwrite is true, it will be added to *elapsed if overwrite is false. +class StopWatch { + public: + StopWatch(Env* const env, Statistics* statistics, const uint32_t hist_type, + uint64_t* elapsed = nullptr, bool overwrite = true, + bool delay_enabled = false) + : env_(env), + statistics_(statistics), + hist_type_(hist_type), + elapsed_(elapsed), + overwrite_(overwrite), + stats_enabled_(statistics && + statistics->get_stats_level() >= + StatsLevel::kExceptTimers && + statistics->HistEnabledForType(hist_type)), + delay_enabled_(delay_enabled), + total_delay_(0), + delay_start_time_(0), + start_time_((stats_enabled_ || elapsed != nullptr) ? env->NowMicros() + : 0) {} + + ~StopWatch() { + if (elapsed_) { + if (overwrite_) { + *elapsed_ = env_->NowMicros() - start_time_; + } else { + *elapsed_ += env_->NowMicros() - start_time_; + } + } + if (elapsed_ && delay_enabled_) { + *elapsed_ -= total_delay_; + } + if (stats_enabled_) { + statistics_->reportTimeToHistogram( + hist_type_, (elapsed_ != nullptr) + ? *elapsed_ + : (env_->NowMicros() - start_time_)); + } + } + + void DelayStart() { + // if delay_start_time_ is not 0, it means we are already tracking delay, + // so delay_start_time_ should not be overwritten + if (elapsed_ && delay_enabled_ && delay_start_time_ == 0) { + delay_start_time_ = env_->NowMicros(); + } + } + + void DelayStop() { + if (elapsed_ && delay_enabled_ && delay_start_time_ != 0) { + total_delay_ += env_->NowMicros() - delay_start_time_; + } + // reset to 0 means currently no delay is being tracked, so two consecutive + // calls to DelayStop will not increase total_delay_ + delay_start_time_ = 0; + } + + uint64_t GetDelay() const { return delay_enabled_ ? 
total_delay_ : 0; } + + uint64_t start_time() const { return start_time_; } + + private: + Env* const env_; + Statistics* statistics_; + const uint32_t hist_type_; + uint64_t* elapsed_; + bool overwrite_; + bool stats_enabled_; + bool delay_enabled_; + uint64_t total_delay_; + uint64_t delay_start_time_; + const uint64_t start_time_; +}; + +// a nano second precision stopwatch +class StopWatchNano { + public: + explicit StopWatchNano(Env* const env, bool auto_start = false) + : env_(env), start_(0) { + if (auto_start) { + Start(); + } + } + + void Start() { start_ = env_->NowNanos(); } + + uint64_t ElapsedNanos(bool reset = false) { + auto now = env_->NowNanos(); + auto elapsed = now - start_; + if (reset) { + start_ = now; + } + return elapsed; + } + + uint64_t ElapsedNanosSafe(bool reset = false) { + return (env_ != nullptr) ? ElapsedNanos(reset) : 0U; + } + + private: + Env* const env_; + uint64_t start_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/string_util.cc b/src/rocksdb/util/string_util.cc new file mode 100644 index 00000000..26e6759a --- /dev/null +++ b/src/rocksdb/util/string_util.cc @@ -0,0 +1,403 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "util/string_util.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <errno.h> +#include <inttypes.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <cmath> +#include <sstream> +#include <string> +#include <utility> +#include <vector> +#include "rocksdb/env.h" +#include "port/port.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +const std::string kNullptrString = "nullptr"; + +std::vector<std::string> StringSplit(const std::string& arg, char delim) { + std::vector<std::string> splits; + std::stringstream ss(arg); + std::string item; + while (std::getline(ss, item, delim)) { + splits.push_back(item); + } + return splits; +} + +// for micros < 10ms, print "XX us". +// for micros < 10sec, print "XX ms". +// for micros >= 10 sec, print "XX sec". +// for micros <= 1 hour, print Y:X M:S". +// for micros > 1 hour, print Z:Y:X H:M:S". +int AppendHumanMicros(uint64_t micros, char* output, int len, + bool fixed_format) { + if (micros < 10000 && !fixed_format) { + return snprintf(output, len, "%" PRIu64 " us", micros); + } else if (micros < 10000000 && !fixed_format) { + return snprintf(output, len, "%.3lf ms", + static_cast<double>(micros) / 1000); + } else if (micros < 1000000l * 60 && !fixed_format) { + return snprintf(output, len, "%.3lf sec", + static_cast<double>(micros) / 1000000); + } else if (micros < 1000000ll * 60 * 60 && !fixed_format) { + return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S", + micros / 1000000 / 60, + static_cast<double>(micros % 60000000) / 1000000); + } else { + return snprintf(output, len, "%02" PRIu64 ":%02" PRIu64 ":%05.3f H:M:S", + micros / 1000000 / 3600, (micros / 1000000 / 60) % 60, + static_cast<double>(micros % 60000000) / 1000000); + } +} + +// for sizes >=10TB, print "XXTB" +// for sizes >=10GB, print "XXGB" +// etc. 
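+// (editor's illustrative examples, not part of the upstream comment:)
+//   AppendHumanBytes(50ull << 30, buf, len) writes "50GB"
+//   AppendHumanBytes(2ull << 40, buf, len)  writes "2048GB" (2TB is below
+//   the 10TB cutoff, so it stays in GB)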
+// append file size summary to output and return the len +int AppendHumanBytes(uint64_t bytes, char* output, int len) { + const uint64_t ull10 = 10; + if (bytes >= ull10 << 40) { + return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40); + } else if (bytes >= ull10 << 30) { + return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30); + } else if (bytes >= ull10 << 20) { + return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20); + } else if (bytes >= ull10 << 10) { + return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10); + } else { + return snprintf(output, len, "%" PRIu64 "B", bytes); + } +} + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%" PRIu64, num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast<unsigned int>(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToString(uint64_t num) { + std::string r; + AppendNumberTo(&r, num); + return r; +} + +std::string NumberToHumanString(int64_t num) { + char buf[19]; + int64_t absnum = num < 0 ? -num : num; + if (absnum < 10000) { + snprintf(buf, sizeof(buf), "%" PRIi64, num); + } else if (absnum < 10000000) { + snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000); + } else if (absnum < 10000000000LL) { + snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000); + } else { + snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000); + } + return std::string(buf); +} + +std::string BytesToHumanString(uint64_t bytes) { + const char* size_name[] = {"KB", "MB", "GB", "TB"}; + double final_size = static_cast<double>(bytes); + size_t size_idx; + + // always start with KB + final_size /= 1024; + size_idx = 0; + + while (size_idx < 3 && final_size >= 1024) { + final_size /= 1024; + size_idx++; + } + + char buf[20]; + snprintf(buf, sizeof(buf), "%.2f %s", final_size, size_name[size_idx]); + return std::string(buf); +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const unsigned int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0); + if (v > kMaxUint64 / 10 || + (v == kMaxUint64 / 10 && delta > kMaxUint64 % 10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +bool isSpecialChar(const char c) { + if (c == '\\' || c == '#' || c == ':' || c == '\r' || c == '\n') { + return true; + } + return false; +} + +namespace { +using CharMap = std::pair<char, char>; +} + +char UnescapeChar(const char c) { + static const CharMap convert_map[] = {{'r', '\r'}, {'n', '\n'}}; + + auto iter = std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + } + return iter->second; +} + +char EscapeChar(const char c) { + static const CharMap convert_map[] = {{'\n', 'n'}, {'\r', 'r'}}; + + auto iter = std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + 
} + return iter->second; +} + +std::string EscapeOptionString(const std::string& raw_string) { + std::string output; + for (auto c : raw_string) { + if (isSpecialChar(c)) { + output += '\\'; + output += EscapeChar(c); + } else { + output += c; + } + } + + return output; +} + +std::string UnescapeOptionString(const std::string& escaped_string) { + bool escaped = false; + std::string output; + + for (auto c : escaped_string) { + if (escaped) { + output += UnescapeChar(c); + escaped = false; + } else { + if (c == '\\') { + escaped = true; + continue; + } + output += c; + } + } + return output; +} + +std::string trim(const std::string& str) { + if (str.empty()) return std::string(); + size_t start = 0; + size_t end = str.size() - 1; + while (isspace(str[start]) != 0 && start < end) { + ++start; + } + while (isspace(str[end]) != 0 && start < end) { + --end; + } + if (start <= end) { + return str.substr(start, end - start + 1); + } + return std::string(); +} + +#ifndef ROCKSDB_LITE + +bool ParseBoolean(const std::string& type, const std::string& value) { + if (value == "true" || value == "1") { + return true; + } else if (value == "false" || value == "0") { + return false; + } + throw std::invalid_argument(type); +} + +uint32_t ParseUint32(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 32LL) == 0) { + return static_cast<uint32_t>(num); + } else { + throw std::out_of_range(value); + } +} + +int32_t ParseInt32(const std::string& value) { + int64_t num = ParseInt64(value); + if (num <= port::kMaxInt32 && num >= port::kMinInt32) { + return static_cast<int32_t>(num); + } else { + throw std::out_of_range(value); + } +} + +#endif + +uint64_t ParseUint64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + uint64_t num = std::stoull(value.c_str(), &endchar); +#else + char* endptr; + uint64_t num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int64_t ParseInt64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int64_t num = std::stoll(value.c_str(), &endchar); +#else + char* endptr; + int64_t num = std::strtoll(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int ParseInt(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int num = std::stoi(value.c_str(), &endchar); +#else + char* endptr; + int num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10; + else if (c == 'm' || c == 'M') + num <<= 20; + else if (c == 'g' || c == 'G') + num <<= 30; + } + + return num; +} + +double ParseDouble(const std::string& value) { +#ifndef CYGWIN + return std::stod(value); +#else + return std::strtod(value.c_str(), 0); +#endif +} + +size_t ParseSizeT(const std::string& value) { + return static_cast<size_t>(ParseUint64(value)); +} + +std::vector<int> ParseVectorInt(const std::string& value) { + 
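+  // (Editor's worked example, not in the upstream source.) The input is a
+  // ':'-separated list whose elements go through ParseInt(), so the k/m/g
+  // size suffixes apply per element:
+  //   ParseVectorInt("1k:2:3") -> {1024, 2, 3}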
+  std::vector<int> result;
+  size_t start = 0;
+  while (start < value.size()) {
+    size_t end = value.find(':', start);
+    if (end == std::string::npos) {
+      result.push_back(ParseInt(value.substr(start)));
+      break;
+    } else {
+      result.push_back(ParseInt(value.substr(start, end - start)));
+      start = end + 1;
+    }
+  }
+  return result;
+}
+
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value) {
+  *value = "";
+  for (size_t i = 0; i < vec.size(); ++i) {
+    if (i > 0) {
+      *value += ":";
+    }
+    *value += ToString(vec[i]);
+  }
+  return true;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/string_util.h b/src/rocksdb/util/string_util.h
new file mode 100644
index 00000000..6e125ddf
--- /dev/null
+++ b/src/rocksdb/util/string_util.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace rocksdb {
+
+class Slice;
+
+extern std::vector<std::string> StringSplit(const std::string& arg, char delim);
+
+template <typename T>
+inline std::string ToString(T value) {
+#if !(defined OS_ANDROID) && !(defined CYGWIN) && !(defined OS_FREEBSD)
+  return std::to_string(value);
+#else
+  // Android and Cygwin don't support all of C++11, std::to_string() being
+  // one of the unsupported features.
+  std::ostringstream os;
+  os << value;
+  return os.str();
+#endif
+}
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Return a string printout of "num"
+extern std::string NumberToString(uint64_t num);
+
+// Return a human-readable version of num.
+// for num >= 10,000, prints "xxK"
+// for num >= 10,000,000, prints "xxM"
+// for num >= 10,000,000,000, prints "xxG"
+extern std::string NumberToHumanString(int64_t num);
+
+// Return a human-readable version of bytes
+// ex: 1048576 -> 1.00 MB
+extern std::string BytesToHumanString(uint64_t bytes);
+
+// Append a human-readable time in micros.
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format);
+
+// Append a human-readable size in bytes
+int AppendHumanBytes(uint64_t bytes, char* output, int len);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// Parse a human-readable number from "*in" into *value. On success,
+// advances "*in" past the consumed number and sets "*val" to the
+// numeric value. Otherwise, returns false and leaves *in in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+// Returns true if the input char "c" is considered a special character
+// that will be escaped when EscapeOptionString() is called.
+//
+// @param c the input char
+// @return true if the input char "c" is considered a special character.
+// @see EscapeOptionString
+bool isSpecialChar(const char c);
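+
+// (Editor's note: a couple of worked examples for the formatters declared
+// above; the function name is hypothetical and the block is illustrative
+// only, not part of the upstream header.)
+inline void HumanReadableExamples() {
+  std::string n = NumberToHumanString(12345678);  // "12M"
+  std::string b = BytesToHumanString(1048576);    // "1.00 MB"
+  (void)n;
+  (void)b;
+}
+
+// If the input char is an escaped char, it will return its
+// associated raw char. Otherwise, the function will simply return
+// the original input char.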
+char UnescapeChar(const char c);
+
+// If the input char is a control char, it will return its
+// associated escaped char. Otherwise, the function will simply return
+// the original input char.
+char EscapeChar(const char c);
+
+// Converts a raw string to an escaped string. Escaped characters are
+// defined via the isSpecialChar() function. When a char in the input
+// string "raw_string" is classified as a special character, it will
+// be prefixed by '\' in the output.
+//
+// Its inverse function is UnescapeOptionString().
+// @param raw_string the input string
+// @return the '\' escaped string of the input "raw_string"
+// @see isSpecialChar, UnescapeOptionString
+std::string EscapeOptionString(const std::string& raw_string);
+
+// The inverse function of EscapeOptionString. It converts
+// an '\' escaped string back to a raw string.
+//
+// @param escaped_string the input '\' escaped string
+// @return the raw string of the input "escaped_string"
+std::string UnescapeOptionString(const std::string& escaped_string);
+
+std::string trim(const std::string& str);
+
+#ifndef ROCKSDB_LITE
+bool ParseBoolean(const std::string& type, const std::string& value);
+
+uint32_t ParseUint32(const std::string& value);
+
+int32_t ParseInt32(const std::string& value);
+#endif
+
+uint64_t ParseUint64(const std::string& value);
+
+int ParseInt(const std::string& value);
+
+int64_t ParseInt64(const std::string& value);
+
+double ParseDouble(const std::string& value);
+
+size_t ParseSizeT(const std::string& value);
+
+std::vector<int> ParseVectorInt(const std::string& value);
+
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value);
+
+extern const std::string kNullptrString;
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/sync_point.cc b/src/rocksdb/util/sync_point.cc
new file mode 100644
index 00000000..4599c256
--- /dev/null
+++ b/src/rocksdb/util/sync_point.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
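+
+// (Editor's note, illustrative only.) The two globals defined below drive the
+// TEST_KILL_RANDOM facility declared in util/sync_point.h. A stress test can
+// set, for example (the prefix name here is hypothetical):
+//
+//   rocksdb_kill_odds = 10;  // crash with probability 1/10 at each kill point
+//   rocksdb_kill_prefix_blacklist = {"SomeComponentPrefix"};  // but never here
+//
+// so that instrumented code paths call TestKillRandom() and the process dies
+// at a random kill point, exercising crash recovery.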
+
+#include "util/sync_point.h"
+#include "util/sync_point_impl.h"
+
+int rocksdb_kill_odds = 0;
+std::vector<std::string> rocksdb_kill_prefix_blacklist;
+
+#ifndef NDEBUG
+namespace rocksdb {
+
+SyncPoint* SyncPoint::GetInstance() {
+  static SyncPoint sync_point;
+  return &sync_point;
+}
+
+SyncPoint::SyncPoint() : impl_(new Data) {}
+
+SyncPoint::~SyncPoint() {
+  delete impl_;
+}
+
+void SyncPoint::LoadDependency(const std::vector<SyncPointPair>& dependencies) {
+  impl_->LoadDependency(dependencies);
+}
+
+void SyncPoint::LoadDependencyAndMarkers(
+    const std::vector<SyncPointPair>& dependencies,
+    const std::vector<SyncPointPair>& markers) {
+  impl_->LoadDependencyAndMarkers(dependencies, markers);
+}
+
+void SyncPoint::SetCallBack(const std::string& point,
+                            const std::function<void(void*)>& callback) {
+  impl_->SetCallBack(point, callback);
+}
+
+void SyncPoint::ClearCallBack(const std::string& point) {
+  impl_->ClearCallBack(point);
+}
+
+void SyncPoint::ClearAllCallBacks() {
+  impl_->ClearAllCallBacks();
+}
+
+void SyncPoint::EnableProcessing() {
+  impl_->EnableProcessing();
+}
+
+void SyncPoint::DisableProcessing() {
+  impl_->DisableProcessing();
+}
+
+void SyncPoint::ClearTrace() {
+  impl_->ClearTrace();
+}
+
+void SyncPoint::Process(const std::string& point, void* cb_arg) {
+  impl_->Process(point, cb_arg);
+}
+
+}  // namespace rocksdb
+#endif  // NDEBUG
diff --git a/src/rocksdb/util/sync_point.h b/src/rocksdb/util/sync_point.h
new file mode 100644
index 00000000..cb4b1e71
--- /dev/null
+++ b/src/rocksdb/util/sync_point.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <assert.h>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+// This is only set from db_stress.cc and for testing only.
+// If non-zero, kill at various points in source code with probability 1/this
+extern int rocksdb_kill_odds;
+// If kill point has a prefix on this list, will skip killing.
+extern std::vector<std::string> rocksdb_kill_prefix_blacklist;
+
+#ifdef NDEBUG
+// empty in release build
+#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds)
+#else
+
+namespace rocksdb {
+// Kill the process with probability 1/odds for testing.
+extern void TestKillRandom(std::string kill_point, int odds,
+                           const std::string& srcfile, int srcline);
+
+// To avoid always crashing at some frequently executed code paths (during
+// the kill random test), use this factor to reduce the odds
+#define REDUCE_ODDS 2
+#define REDUCE_ODDS2 4
+
+#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds)                  \
+  {                                                                      \
+    if (rocksdb_kill_odds > 0) {                                         \
+      TestKillRandom(kill_point, rocksdb_kill_odds, __FILE__, __LINE__); \
+    }                                                                    \
+  }
+}  // namespace rocksdb
+#endif
+
+#ifdef NDEBUG
+#define TEST_SYNC_POINT(x)
+#define TEST_IDX_SYNC_POINT(x, index)
+#define TEST_SYNC_POINT_CALLBACK(x, y)
+#define INIT_SYNC_POINT_SINGLETONS()
+#else
+
+namespace rocksdb {
+
+// This class provides a facility to reproduce race conditions
+// deterministically in unit tests.
+// Developers can specify sync points in the codebase via TEST_SYNC_POINT.
+// Each sync point represents a position in the execution stream of a thread.
+// In the unit test, 'Happens After' relationships among sync points can be
+// set up via SyncPoint::LoadDependency, to reproduce a desired interleaving
+// of thread execution.
+// Refer to (DBTest,TransactionLogIteratorRace) for an example use case.
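+//
+// A minimal usage sketch (editor's illustration; the point names are
+// hypothetical):
+//
+//   // In the test body: make thread B's point wait on thread A's.
+//   rocksdb::SyncPoint::GetInstance()->LoadDependency(
+//       {{"ThreadA:AfterWrite", "ThreadB:BeforeRead"}});
+//   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+//
+//   // In the code under test:
+//   TEST_SYNC_POINT("ThreadA:AfterWrite");  // thread A
+//   TEST_SYNC_POINT("ThreadB:BeforeRead");  // thread B blocks until A passes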
+
+class SyncPoint {
+ public:
+  static SyncPoint* GetInstance();
+
+  SyncPoint(const SyncPoint&) = delete;
+  SyncPoint& operator=(const SyncPoint&) = delete;
+  ~SyncPoint();
+
+  struct SyncPointPair {
+    std::string predecessor;
+    std::string successor;
+  };
+
+  // call once at the beginning of a test to set up the dependency between
+  // sync points
+  void LoadDependency(const std::vector<SyncPointPair>& dependencies);
+
+  // call once at the beginning of a test to set up the dependency between
+  // sync points and set up markers indicating the successor is only enabled
+  // when it is processed on the same thread as the predecessor.
+  // When adding a marker, it implicitly adds a dependency for the marker pair.
+  void LoadDependencyAndMarkers(const std::vector<SyncPointPair>& dependencies,
+                                const std::vector<SyncPointPair>& markers);
+
+  // The argument to the callback is passed through from
+  // TEST_SYNC_POINT_CALLBACK(); nullptr if TEST_SYNC_POINT or
+  // TEST_IDX_SYNC_POINT was used.
+  void SetCallBack(const std::string& point,
+                   const std::function<void(void*)>& callback);
+
+  // Clear the callback function registered for a point
+  void ClearCallBack(const std::string& point);
+
+  // Clear all callback functions.
+  void ClearAllCallBacks();
+
+  // enable sync point processing (disabled on startup)
+  void EnableProcessing();
+
+  // disable sync point processing
+  void DisableProcessing();
+
+  // remove the execution trace of all sync points
+  void ClearTrace();
+
+  // triggered by TEST_SYNC_POINT, blocking execution until all predecessors
+  // are executed.
+  // And/or call the registered callback function, with argument `cb_arg`
+  void Process(const std::string& point, void* cb_arg = nullptr);
+
+  // TODO: it might be useful to provide a function that blocks until all
+  // sync points are cleared.
+
+  // We want this to be public so we can
+  // subclass the implementation
+  struct Data;
+
+ private:
+  // Singleton
+  SyncPoint();
+  Data* impl_;
+};
+
+}  // namespace rocksdb
+
+// Use TEST_SYNC_POINT to specify sync points inside the code base.
+// Sync points can have a happens-after dependency on other sync points,
+// configured at runtime via SyncPoint::LoadDependency. This can be
+// used to reproduce race conditions between threads.
+// See TransactionLogIteratorRace in db_test.cc for an example use case.
+// TEST_SYNC_POINT is a no-op in release builds.
+#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
+#define TEST_IDX_SYNC_POINT(x, index) \
+  rocksdb::SyncPoint::GetInstance()->Process(x + std::to_string(index))
+#define TEST_SYNC_POINT_CALLBACK(x, y) \
+  rocksdb::SyncPoint::GetInstance()->Process(x, y)
+#define INIT_SYNC_POINT_SINGLETONS() \
+  (void)rocksdb::SyncPoint::GetInstance();
+#endif  // NDEBUG
diff --git a/src/rocksdb/util/sync_point_impl.cc b/src/rocksdb/util/sync_point_impl.cc
new file mode 100644
index 00000000..248c381a
--- /dev/null
+++ b/src/rocksdb/util/sync_point_impl.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+ +#include "util/sync_point_impl.h" + +#ifndef NDEBUG +namespace rocksdb { + +void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline) { + for (auto& p : rocksdb_kill_prefix_blacklist) { + if (kill_point.substr(0, p.length()) == p) { + return; + } + } + + assert(odds > 0); + if (odds % 7 == 0) { + // class Random uses multiplier 16807, which is 7^5. If odds are + // multiplier of 7, there might be limited values generated. + odds++; + } + auto* r = Random::GetTLSInstance(); + bool crash = r->OneIn(odds); + if (crash) { + port::Crash(srcfile, srcline); + } +} + + +void SyncPoint::Data::LoadDependency(const std::vector<SyncPointPair>& dependencies) { + std::lock_guard<std::mutex> lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } + cv_.notify_all(); +} + +void SyncPoint::Data::LoadDependencyAndMarkers( + const std::vector<SyncPointPair>& dependencies, + const std::vector<SyncPointPair>& markers) { + std::lock_guard<std::mutex> lock(mutex_); + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + markers_.clear(); + marked_thread_id_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } + for (const auto& marker : markers) { + successors_[marker.predecessor].push_back(marker.successor); + predecessors_[marker.successor].push_back(marker.predecessor); + markers_[marker.predecessor].push_back(marker.successor); + } + cv_.notify_all(); +} + +bool SyncPoint::Data::PredecessorsAllCleared(const std::string& point) { + for (const auto& pred : predecessors_[point]) { + if (cleared_points_.count(pred) == 0) { + return false; + } + } + return true; +} + +void SyncPoint::Data::ClearCallBack(const std::string& point) { + std::unique_lock<std::mutex> lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.erase(point); +} + +void SyncPoint::Data::ClearAllCallBacks() { + std::unique_lock<std::mutex> lock(mutex_); + while (num_callbacks_running_ > 0) { + cv_.wait(lock); + } + callbacks_.clear(); +} + +void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { + if (!enabled_) { + return; + } + + std::unique_lock<std::mutex> lock(mutex_); + auto thread_id = std::this_thread::get_id(); + + auto marker_iter = markers_.find(point); + if (marker_iter != markers_.end()) { + for (auto& marked_point : marker_iter->second) { + marked_thread_id_.emplace(marked_point, thread_id); + } + } + + if (DisabledByMarker(point, thread_id)) { + return; + } + + while (!PredecessorsAllCleared(point)) { + cv_.wait(lock); + if (DisabledByMarker(point, thread_id)) { + return; + } + } + + auto callback_pair = callbacks_.find(point); + if (callback_pair != callbacks_.end()) { + num_callbacks_running_++; + mutex_.unlock(); + callback_pair->second(cb_arg); + mutex_.lock(); + num_callbacks_running_--; + } + cleared_points_.insert(point); + cv_.notify_all(); +} +} // rocksdb +#endif diff --git a/src/rocksdb/util/sync_point_impl.h b/src/rocksdb/util/sync_point_impl.h new file mode 100644 index 00000000..3c7e7049 --- /dev/null +++ b/src/rocksdb/util/sync_point_impl.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/sync_point.h" + +#include <assert.h> +#include <atomic> +#include <condition_variable> +#include <functional> +#include <mutex> +#include <string> +#include <thread> +#include <unordered_map> +#include <unordered_set> + +#include "port/port.h" +#include "util/random.h" + +#pragma once + +#ifndef NDEBUG +namespace rocksdb { +struct SyncPoint::Data { + Data() : enabled_(false) {} + // Enable proper deletion by subclasses + virtual ~Data() {} + // successor/predecessor map loaded from LoadDependency + std::unordered_map<std::string, std::vector<std::string>> successors_; + std::unordered_map<std::string, std::vector<std::string>> predecessors_; + std::unordered_map<std::string, std::function<void(void*)> > callbacks_; + std::unordered_map<std::string, std::vector<std::string> > markers_; + std::unordered_map<std::string, std::thread::id> marked_thread_id_; + + std::mutex mutex_; + std::condition_variable cv_; + // sync points that have been passed through + std::unordered_set<std::string> cleared_points_; + std::atomic<bool> enabled_; + int num_callbacks_running_ = 0; + + void LoadDependency(const std::vector<SyncPointPair>& dependencies); + void LoadDependencyAndMarkers(const std::vector<SyncPointPair>& dependencies, + const std::vector<SyncPointPair>& markers); + bool PredecessorsAllCleared(const std::string& point); + void SetCallBack(const std::string& point, + const std::function<void(void*)>& callback) { + std::lock_guard<std::mutex> lock(mutex_); + callbacks_[point] = callback; +} + + void ClearCallBack(const std::string& point); + void ClearAllCallBacks(); + void EnableProcessing() { + enabled_ = true; + } + void DisableProcessing() { + enabled_ = false; + } + void ClearTrace() { + std::lock_guard<std::mutex> lock(mutex_); + cleared_points_.clear(); + } + bool DisabledByMarker(const std::string& point, + std::thread::id thread_id) { + auto marked_point_iter = marked_thread_id_.find(point); + return marked_point_iter != marked_thread_id_.end() && + thread_id != marked_point_iter->second; + } + void Process(const std::string& point, void* cb_arg); +}; +} +#endif // NDEBUG diff --git a/src/rocksdb/util/testharness.cc b/src/rocksdb/util/testharness.cc new file mode 100644 index 00000000..8f5eb2a4 --- /dev/null +++ b/src/rocksdb/util/testharness.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
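+
+// (Editor's sketch of how the helpers defined below are typically used in a
+// gtest binary; illustrative only, test and directory names are
+// hypothetical.)
+//
+//   TEST(ExampleTest, CreatesDir) {
+//     rocksdb::Env* env = rocksdb::Env::Default();
+//     std::string dir = rocksdb::test::PerThreadDBPath(env, "example");
+//     ASSERT_OK(env->CreateDirIfMissing(dir));
+//   }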
+ +#include "util/testharness.h" +#include <string> +#include <thread> + +namespace rocksdb { +namespace test { + +::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) { + if (s.ok()) { + return ::testing::AssertionSuccess(); + } else { + return ::testing::AssertionFailure() << s_expr << std::endl + << s.ToString(); + } +} + +std::string TmpDir(Env* env) { + std::string dir; + Status s = env->GetTestDirectory(&dir); + EXPECT_TRUE(s.ok()) << s.ToString(); + return dir; +} + +std::string PerThreadDBPath(std::string dir, std::string name) { + size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id()); + return dir + "/" + name + "_" + std::to_string(tid); +} + +std::string PerThreadDBPath(std::string name) { + return PerThreadDBPath(test::TmpDir(), name); +} + +std::string PerThreadDBPath(Env* env, std::string name) { + return PerThreadDBPath(test::TmpDir(env), name); +} + +int RandomSeed() { + const char* env = getenv("TEST_RANDOM_SEED"); + int result = (env != nullptr ? atoi(env) : 301); + if (result <= 0) { + result = 301; + } + return result; +} + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/testharness.h b/src/rocksdb/util/testharness.h new file mode 100644 index 00000000..39e77f8a --- /dev/null +++ b/src/rocksdb/util/testharness.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifdef OS_AIX +#include "gtest/gtest.h" +#else +#include <gtest/gtest.h> +#endif + +#include <string> +#include "rocksdb/env.h" + +namespace rocksdb { +namespace test { + +// Return the directory to use for temporary storage. +std::string TmpDir(Env* env = Env::Default()); + +// A path unique within the thread +std::string PerThreadDBPath(std::string name); +std::string PerThreadDBPath(Env* env, std::string name); +std::string PerThreadDBPath(std::string dir, std::string name); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +int RandomSeed(); + +::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s); + +#define ASSERT_OK(s) ASSERT_PRED_FORMAT1(rocksdb::test::AssertStatus, s) +#define ASSERT_NOK(s) ASSERT_FALSE((s).ok()) +#define EXPECT_OK(s) EXPECT_PRED_FORMAT1(rocksdb::test::AssertStatus, s) +#define EXPECT_NOK(s) EXPECT_FALSE((s).ok()) + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/testutil.cc b/src/rocksdb/util/testutil.cc new file mode 100644 index 00000000..ec95d107 --- /dev/null +++ b/src/rocksdb/util/testutil.cc @@ -0,0 +1,417 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "util/testutil.h" + +#include <cctype> +#include <sstream> + +#include "db/memtable_list.h" +#include "port/port.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { +namespace test { + +const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; +const uint32_t kLatestFormatVersion = 4u; + +Slice RandomString(Random* rnd, int len, std::string* dst) { + dst->resize(len); + for (int i = 0; i < len; i++) { + (*dst)[i] = static_cast<char>(' ' + rnd->Uniform(95)); // ' ' .. '~' + } + return Slice(*dst); +} + +extern std::string RandomHumanReadableString(Random* rnd, int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; ++i) { + ret[i] = static_cast<char>('a' + rnd->Uniform(26)); + } + return ret; +} + +std::string RandomKey(Random* rnd, int len, RandomKeyType type) { + // Make sure to generate a wide variety of characters so we + // test the boundary conditions for short-key optimizations. + static const char kTestChars[] = {'\0', '\1', 'a', 'b', 'c', + 'd', 'e', '\xfd', '\xfe', '\xff'}; + std::string result; + for (int i = 0; i < len; i++) { + std::size_t indx = 0; + switch (type) { + case RandomKeyType::RANDOM: + indx = rnd->Uniform(sizeof(kTestChars)); + break; + case RandomKeyType::LARGEST: + indx = sizeof(kTestChars) - 1; + break; + case RandomKeyType::MIDDLE: + indx = sizeof(kTestChars) / 2; + break; + case RandomKeyType::SMALLEST: + indx = 0; + break; + } + result += kTestChars[indx]; + } + return result; +} + +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast<int>(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data; + RandomString(rnd, raw, &raw_data); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < (unsigned int)len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + +namespace { +class Uint64ComparatorImpl : public Comparator { + public: + Uint64ComparatorImpl() {} + + const char* Name() const override { return "rocksdb.Uint64Comparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + assert(a.size() == sizeof(uint64_t) && b.size() == sizeof(uint64_t)); + const uint64_t* left = reinterpret_cast<const uint64_t*>(a.data()); + const uint64_t* right = reinterpret_cast<const uint64_t*>(b.data()); + uint64_t leftValue; + uint64_t rightValue; + GetUnaligned(left, &leftValue); + GetUnaligned(right, &rightValue); + if (leftValue == rightValue) { + return 0; + } else if (leftValue < rightValue) { + return -1; + } else { + return 1; + } + } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override { + return; + } + + void FindShortSuccessor(std::string* /*key*/) const override { return; } +}; +} // namespace + +const Comparator* Uint64Comparator() { + static Uint64ComparatorImpl uint64comp; + return &uint64comp; +} + +WritableFileWriter* GetWritableFileWriter(WritableFile* wf, + const std::string& fname) { + std::unique_ptr<WritableFile> file(wf); + return new WritableFileWriter(std::move(file), fname, EnvOptions()); +} + +RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { + std::unique_ptr<RandomAccessFile> file(raf); + return new RandomAccessFileReader(std::move(file), + "[test RandomAccessFileReader]"); +} + +SequentialFileReader* GetSequentialFileReader(SequentialFile* se, + const std::string& fname) { + 
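+  // Takes ownership of `se`: the raw pointer is adopted by a unique_ptr and
+  // handed to the SequentialFileReader, mirroring GetWritableFileWriter()
+  // and GetRandomAccessFileReader() above.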
std::unique_ptr<SequentialFile> file(se); + return new SequentialFileReader(std::move(file), fname); +} + +void CorruptKeyType(InternalKey* ikey) { + std::string keystr = ikey->Encode().ToString(); + keystr[keystr.size() - 8] = kTypeLogData; + ikey->DecodeFrom(Slice(keystr.data(), keystr.size())); +} + +std::string KeyStr(const std::string& user_key, const SequenceNumber& seq, + const ValueType& t, bool corrupt) { + InternalKey k(user_key, seq, t); + if (corrupt) { + CorruptKeyType(&k); + } + return k.Encode().ToString(); +} + +std::string RandomName(Random* rnd, const size_t len) { + std::stringstream ss; + for (size_t i = 0; i < len; ++i) { + ss << static_cast<char>(rnd->Uniform(26) + 'a'); + } + return ss.str(); +} + +CompressionType RandomCompressionType(Random* rnd) { + return static_cast<CompressionType>(rnd->Uniform(6)); +} + +void RandomCompressionTypeVector(const size_t count, + std::vector<CompressionType>* types, + Random* rnd) { + types->clear(); + for (size_t i = 0; i < count; ++i) { + types->emplace_back(RandomCompressionType(rnd)); + } +} + +const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined) { + int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4); + switch (random_num) { + case 0: + return NewFixedPrefixTransform(rnd->Uniform(20) + 1); + case 1: + return NewCappedPrefixTransform(rnd->Uniform(20) + 1); + case 2: + return NewNoopTransform(); + default: + return nullptr; + } +} + +BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { + BlockBasedTableOptions opt; + opt.cache_index_and_filter_blocks = rnd->Uniform(2); + opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); + opt.pin_top_level_index_and_filter = rnd->Uniform(2); + opt.index_type = rnd->Uniform(2) ? BlockBasedTableOptions::kBinarySearch + : BlockBasedTableOptions::kHashSearch; + opt.hash_index_allow_collision = rnd->Uniform(2); + opt.checksum = static_cast<ChecksumType>(rnd->Uniform(3)); + opt.block_size = rnd->Uniform(10000000); + opt.block_size_deviation = rnd->Uniform(100); + opt.block_restart_interval = rnd->Uniform(100); + opt.index_block_restart_interval = rnd->Uniform(100); + opt.whole_key_filtering = rnd->Uniform(2); + + return opt; +} + +TableFactory* RandomTableFactory(Random* rnd, int pre_defined) { +#ifndef ROCKSDB_LITE + int random_num = pre_defined >= 0 ? 
pre_defined : rnd->Uniform(4); + switch (random_num) { + case 0: + return NewPlainTableFactory(); + case 1: + return NewCuckooTableFactory(); + default: + return NewBlockBasedTableFactory(); + } +#else + (void)rnd; + (void)pre_defined; + return NewBlockBasedTableFactory(); +#endif // !ROCKSDB_LITE +} + +MergeOperator* RandomMergeOperator(Random* rnd) { + return new ChanglingMergeOperator(RandomName(rnd, 10)); +} + +CompactionFilter* RandomCompactionFilter(Random* rnd) { + return new ChanglingCompactionFilter(RandomName(rnd, 10)); +} + +CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd) { + return new ChanglingCompactionFilterFactory(RandomName(rnd, 10)); +} + +void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { + // boolean options + db_opt->advise_random_on_open = rnd->Uniform(2); + db_opt->allow_mmap_reads = rnd->Uniform(2); + db_opt->allow_mmap_writes = rnd->Uniform(2); + db_opt->use_direct_reads = rnd->Uniform(2); + db_opt->use_direct_io_for_flush_and_compaction = rnd->Uniform(2); + db_opt->create_if_missing = rnd->Uniform(2); + db_opt->create_missing_column_families = rnd->Uniform(2); + db_opt->enable_thread_tracking = rnd->Uniform(2); + db_opt->error_if_exists = rnd->Uniform(2); + db_opt->is_fd_close_on_exec = rnd->Uniform(2); + db_opt->paranoid_checks = rnd->Uniform(2); + db_opt->skip_log_error_on_recovery = rnd->Uniform(2); + db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); + db_opt->use_adaptive_mutex = rnd->Uniform(2); + db_opt->use_fsync = rnd->Uniform(2); + db_opt->recycle_log_file_num = rnd->Uniform(2); + db_opt->avoid_flush_during_recovery = rnd->Uniform(2); + db_opt->avoid_flush_during_shutdown = rnd->Uniform(2); + + // int options + db_opt->max_background_compactions = rnd->Uniform(100); + db_opt->max_background_flushes = rnd->Uniform(100); + db_opt->max_file_opening_threads = rnd->Uniform(100); + db_opt->max_open_files = rnd->Uniform(100); + db_opt->table_cache_numshardbits = rnd->Uniform(100); + + // size_t options + db_opt->db_write_buffer_size = rnd->Uniform(10000); + db_opt->keep_log_file_num = rnd->Uniform(10000); + db_opt->log_file_time_to_roll = rnd->Uniform(10000); + db_opt->manifest_preallocation_size = rnd->Uniform(10000); + db_opt->max_log_file_size = rnd->Uniform(10000); + + // std::string options + db_opt->db_log_dir = "path/to/db_log_dir"; + db_opt->wal_dir = "path/to/wal_dir"; + + // uint32_t options + db_opt->max_subcompactions = rnd->Uniform(100000); + + // uint64_t options + static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX); + db_opt->WAL_size_limit_MB = uint_max + rnd->Uniform(100000); + db_opt->WAL_ttl_seconds = uint_max + rnd->Uniform(100000); + db_opt->bytes_per_sync = uint_max + rnd->Uniform(100000); + db_opt->delayed_write_rate = uint_max + rnd->Uniform(100000); + db_opt->delete_obsolete_files_period_micros = uint_max + rnd->Uniform(100000); + db_opt->max_manifest_file_size = uint_max + rnd->Uniform(100000); + db_opt->max_total_wal_size = uint_max + rnd->Uniform(100000); + db_opt->wal_bytes_per_sync = uint_max + rnd->Uniform(100000); + + // unsigned int options + db_opt->stats_dump_period_sec = rnd->Uniform(100000); +} + +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { + cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4)); + + // boolean options + cf_opt->report_bg_io_stats = rnd->Uniform(2); + cf_opt->disable_auto_compactions = rnd->Uniform(2); + cf_opt->inplace_update_support = rnd->Uniform(2); + cf_opt->level_compaction_dynamic_level_bytes = rnd->Uniform(2); + 
cf_opt->optimize_filters_for_hits = rnd->Uniform(2); + cf_opt->paranoid_file_checks = rnd->Uniform(2); + cf_opt->purge_redundant_kvs_while_flush = rnd->Uniform(2); + cf_opt->force_consistency_checks = rnd->Uniform(2); + cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2); + cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); + + // double options + cf_opt->hard_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13; + cf_opt->soft_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13; + cf_opt->memtable_prefix_bloom_size_ratio = + static_cast<double>(rnd->Uniform(10000)) / 20000.0; + + // int options + cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); + cf_opt->level0_slowdown_writes_trigger = rnd->Uniform(100); + cf_opt->level0_stop_writes_trigger = rnd->Uniform(100); + cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100); + cf_opt->max_mem_compaction_level = rnd->Uniform(100); + cf_opt->max_write_buffer_number = rnd->Uniform(100); + cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100); + cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100); + cf_opt->num_levels = rnd->Uniform(100); + cf_opt->target_file_size_multiplier = rnd->Uniform(100); + + // vector int options + cf_opt->max_bytes_for_level_multiplier_additional.resize(cf_opt->num_levels); + for (int i = 0; i < cf_opt->num_levels; i++) { + cf_opt->max_bytes_for_level_multiplier_additional[i] = rnd->Uniform(100); + } + + // size_t options + cf_opt->arena_block_size = rnd->Uniform(10000); + cf_opt->inplace_update_num_locks = rnd->Uniform(10000); + cf_opt->max_successive_merges = rnd->Uniform(10000); + cf_opt->memtable_huge_page_size = rnd->Uniform(10000); + cf_opt->write_buffer_size = rnd->Uniform(10000); + + // uint32_t options + cf_opt->bloom_locality = rnd->Uniform(10000); + cf_opt->max_bytes_for_level_base = rnd->Uniform(10000); + + // uint64_t options + static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX); + cf_opt->ttl = uint_max + rnd->Uniform(10000); + cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000); + cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000); + cf_opt->max_compaction_bytes = + cf_opt->target_file_size_base * rnd->Uniform(100); + cf_opt->compaction_options_fifo.max_table_files_size = + uint_max + rnd->Uniform(10000); + + // unsigned int options + cf_opt->rate_limit_delay_max_milliseconds = rnd->Uniform(10000); + + // pointer typed options + cf_opt->prefix_extractor.reset(RandomSliceTransform(rnd)); + cf_opt->table_factory.reset(RandomTableFactory(rnd)); + cf_opt->merge_operator.reset(RandomMergeOperator(rnd)); + if (cf_opt->compaction_filter) { + delete cf_opt->compaction_filter; + } + cf_opt->compaction_filter = RandomCompactionFilter(rnd); + cf_opt->compaction_filter_factory.reset(RandomCompactionFilterFactory(rnd)); + + // custom typed options + cf_opt->compression = RandomCompressionType(rnd); + RandomCompressionTypeVector(cf_opt->num_levels, + &cf_opt->compression_per_level, rnd); +} + +Status DestroyDir(Env* env, const std::string& dir) { + Status s; + if (env->FileExists(dir).IsNotFound()) { + return s; + } + std::vector<std::string> files_in_dir; + s = env->GetChildren(dir, &files_in_dir); + if (s.ok()) { + for (auto& file_in_dir : files_in_dir) { + if (file_in_dir == "." 
|| file_in_dir == "..") { + continue; + } + s = env->DeleteFile(dir + "/" + file_in_dir); + if (!s.ok()) { + break; + } + } + } + + if (s.ok()) { + s = env->DeleteDir(dir); + } + return s; +} + +bool IsDirectIOSupported(Env* env, const std::string& dir) { + EnvOptions env_options; + env_options.use_mmap_writes = false; + env_options.use_direct_writes = true; + std::string tmp = TempFileName(dir, 999); + Status s; + { + std::unique_ptr<WritableFile> file; + s = env->NewWritableFile(tmp, &file, env_options); + } + if (s.ok()) { + s = env->DeleteFile(tmp); + } + return s.ok(); +} + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/testutil.h b/src/rocksdb/util/testutil.h new file mode 100644 index 00000000..2aab3df7 --- /dev/null +++ b/src/rocksdb/util/testutil.h @@ -0,0 +1,754 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <algorithm> +#include <deque> +#include <string> +#include <vector> + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/block_based_table_factory.h" +#include "table/internal_iterator.h" +#include "table/plain_table_factory.h" +#include "util/mutexlock.h" +#include "util/random.h" + +namespace rocksdb { +class SequentialFile; +class SequentialFileReader; + +namespace test { + +extern const uint32_t kDefaultFormatVersion; +extern const uint32_t kLatestFormatVersion; + +// Store in *dst a random string of length "len" and return a Slice that +// references the generated data. +extern Slice RandomString(Random* rnd, int len, std::string* dst); + +extern std::string RandomHumanReadableString(Random* rnd, int len); + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +enum RandomKeyType : char { RANDOM, LARGEST, SMALLEST, MIDDLE }; +extern std::string RandomKey(Random* rnd, int len, + RandomKeyType type = RandomKeyType::RANDOM); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +// A wrapper that allows injection of errors. +class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& soptions) override { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; + +#ifndef NDEBUG +// An internal comparator that just forward comparing results from the +// user comparator in it. 
Can be used to test entities that have no dependency
+// on internal key structure but consume an InternalKeyComparator, like
+// BlockBasedTable.
+class PlainInternalKeyComparator : public InternalKeyComparator {
+ public:
+  explicit PlainInternalKeyComparator(const Comparator* c)
+      : InternalKeyComparator(c) {}
+
+  virtual ~PlainInternalKeyComparator() {}
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    return user_comparator()->Compare(a, b);
+  }
+};
+#endif
+
+// A test comparator which compares two strings in this way:
+// (1) first compare the 8-byte prefix in alphabetical order,
+// (2) if two strings share the same prefix, sort the rest of the string
+//     in reverse alphabetical order.
+// This helps simulate the case of a compound key of [entity][timestamp] with
+// the latest timestamp first.
+class SimpleSuffixReverseComparator : public Comparator {
+ public:
+  SimpleSuffixReverseComparator() {}
+
+  virtual const char* Name() const override {
+    return "SimpleSuffixReverseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    Slice prefix_a = Slice(a.data(), 8);
+    Slice prefix_b = Slice(b.data(), 8);
+    int prefix_comp = prefix_a.compare(prefix_b);
+    if (prefix_comp != 0) {
+      return prefix_comp;
+    } else {
+      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
+      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
+      return -(suffix_a.compare(suffix_b));
+    }
+  }
+  virtual void FindShortestSeparator(std::string* /*start*/,
+                                     const Slice& /*limit*/) const override {}
+
+  virtual void FindShortSuccessor(std::string* /*key*/) const override {}
+};
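+
+// (Editor's sketch, not part of the upstream header; the function name is
+// hypothetical.) With equal 8-byte prefixes the comparison above falls
+// through to the *negated* suffix compare, so the lexically larger suffix
+// sorts first, i.e. latest-timestamp-first:
+inline int SimpleSuffixReverseComparatorExample() {
+  SimpleSuffixReverseComparator cmp;
+  // The "entity00" prefixes match; suffix "2" > "1", so Compare() returns a
+  // negative value and "entity002" orders before "entity001".
+  return cmp.Compare(Slice("entity002"), Slice("entity001"));
+}
+
+// Returns a user key comparator that can be used for comparing two uint64_t
+// slices. Instead of comparing slices byte-wise, it compares all the 8 bytes
+// at once. Assumes the same endianness is used throughout the database's
+// lifetime. Semantics of the comparison would differ from the Bytewise
+// comparator on little-endian machines.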
+extern const Comparator* Uint64Comparator(); + +// Iterator over a vector of keys/values +class VectorIterator : public InternalIterator { + public: + explicit VectorIterator(const std::vector<std::string>& keys) + : keys_(keys), current_(keys.size()) { + std::sort(keys_.begin(), keys_.end()); + values_.resize(keys.size()); + } + + VectorIterator(const std::vector<std::string>& keys, + const std::vector<std::string>& values) + : keys_(keys), values_(values), current_(keys.size()) { + assert(keys_.size() == values_.size()); + } + + virtual bool Valid() const override { return current_ < keys_.size(); } + + virtual void SeekToFirst() override { current_ = 0; } + virtual void SeekToLast() override { current_ = keys_.size() - 1; } + + virtual void Seek(const Slice& target) override { + current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + + virtual void SeekForPrev(const Slice& target) override { + current_ = std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + if (!Valid()) { + SeekToLast(); + } else { + Prev(); + } + } + + virtual void Next() override { current_++; } + virtual void Prev() override { current_--; } + + virtual Slice key() const override { return Slice(keys_[current_]); } + virtual Slice value() const override { return Slice(values_[current_]); } + + virtual Status status() const override { return Status::OK(); } + + virtual bool IsKeyPinned() const override { return true; } + virtual bool IsValuePinned() const override { return true; } + + private: + std::vector<std::string> keys_; + std::vector<std::string> values_; + size_t current_; +}; +extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf, + const std::string& fname); + +extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); + +extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se, + const std::string& fname); + +class StringSink: public WritableFile { + public: + std::string contents_; + + explicit StringSink(Slice* reader_contents = nullptr) : + WritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { + if (reader_contents_ != nullptr) { + *reader_contents_ = Slice(contents_.data(), 0); + } + } + + const std::string& contents() const { return contents_; } + + virtual Status Truncate(uint64_t size) override { + contents_.resize(static_cast<size_t>(size)); + return Status::OK(); + } + virtual Status Close() override { return Status::OK(); } + virtual Status Flush() override { + if (reader_contents_ != nullptr) { + assert(reader_contents_->size() <= last_flush_); + size_t offset = last_flush_ - reader_contents_->size(); + *reader_contents_ = Slice( + contents_.data() + offset, + contents_.size() - offset); + last_flush_ = contents_.size(); + } + + return Status::OK(); + } + virtual Status Sync() override { return Status::OK(); } + virtual Status Append(const Slice& slice) override { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + void Drop(size_t bytes) { + if (reader_contents_ != nullptr) { + contents_.resize(contents_.size() - bytes); + *reader_contents_ = Slice( + reader_contents_->data(), reader_contents_->size() - bytes); + last_flush_ = contents_.size(); + } + } + + private: + Slice* reader_contents_; + size_t last_flush_; +}; + +// A wrapper around a StringSink to give it a RandomRWFile interface +class RandomRWStringSink : public RandomRWFile { + public: + explicit RandomRWStringSink(StringSink* ss) : ss_(ss) {} + + Status 
Write(uint64_t offset, const Slice& data) override {
+    if (offset + data.size() > ss_->contents_.size()) {
+      ss_->contents_.resize(static_cast<size_t>(offset) + data.size(), '\0');
+    }
+
+    char* pos = const_cast<char*>(ss_->contents_.data() + offset);
+    memcpy(pos, data.data(), data.size());
+    return Status::OK();
+  }
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* /*scratch*/) const override {
+    *result = Slice(nullptr, 0);
+    if (offset < ss_->contents_.size()) {
+      size_t str_res_sz =
+          std::min(static_cast<size_t>(ss_->contents_.size() - offset), n);
+      *result = Slice(ss_->contents_.data() + offset, str_res_sz);
+    }
+    return Status::OK();
+  }
+
+  Status Flush() override { return Status::OK(); }
+
+  Status Sync() override { return Status::OK(); }
+
+  Status Close() override { return Status::OK(); }
+
+  const std::string& contents() const { return ss_->contents(); }
+
+ private:
+  StringSink* ss_;
+};
+
+// Like StringSink, this writes into a string. Unlike StringSink, it
+// has some initial content and overwrites it, just like a recycled
+// log file.
+class OverwritingStringSink : public WritableFile {
+ public:
+  explicit OverwritingStringSink(Slice* reader_contents)
+      : WritableFile(),
+        contents_(""),
+        reader_contents_(reader_contents),
+        last_flush_(0) {}
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Truncate(uint64_t size) override {
+    contents_.resize(static_cast<size_t>(size));
+    return Status::OK();
+  }
+  virtual Status Close() override { return Status::OK(); }
+  virtual Status Flush() override {
+    if (last_flush_ < contents_.size()) {
+      assert(reader_contents_->size() >= contents_.size());
+      memcpy((char*)reader_contents_->data() + last_flush_,
+             contents_.data() + last_flush_, contents_.size() - last_flush_);
+      last_flush_ = contents_.size();
+    }
+    return Status::OK();
+  }
+  virtual Status Sync() override { return Status::OK(); }
+  virtual Status Append(const Slice& slice) override {
+    contents_.append(slice.data(), slice.size());
+    return Status::OK();
+  }
+  void Drop(size_t bytes) {
+    contents_.resize(contents_.size() - bytes);
+    if (last_flush_ > contents_.size()) last_flush_ = contents_.size();
+  }
+
+ private:
+  std::string contents_;
+  Slice* reader_contents_;
+  size_t last_flush_;
+};
+
+class StringSource: public RandomAccessFile {
+ public:
+  explicit StringSource(const Slice& contents, uint64_t uniq_id = 0,
+                        bool mmap = false)
+      : contents_(contents.data(), contents.size()),
+        uniq_id_(uniq_id),
+        mmap_(mmap),
+        total_reads_(0) {}
+
+  virtual ~StringSource() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const override {
+    total_reads_++;
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - static_cast<size_t>(offset);
+    }
+    if (!mmap_) {
+      memcpy(scratch, &contents_[static_cast<size_t>(offset)], n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&contents_[static_cast<size_t>(offset)], n);
+    }
+    return Status::OK();
+  }
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
+    if (max_size < 20) {
+      return 0;
+    }
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, uniq_id_);
+    rid = EncodeVarint64(rid, 0);
+    return static_cast<size_t>(rid-id);
+  }
+
+  int total_reads() const { return total_reads_; }
+
+  void set_total_reads(int tr) { total_reads_ = tr; }
+
+ private:
+  std::string
contents_; + uint64_t uniq_id_; + bool mmap_; + mutable int total_reads_; +}; + +class NullLogger : public Logger { + public: + using Logger::Logv; + virtual void Logv(const char* /*format*/, va_list /*ap*/) override {} + virtual size_t GetLogFileSize() const override { return 0; } +}; + +// Corrupts key by changing the type +extern void CorruptKeyType(InternalKey* ikey); + +extern std::string KeyStr(const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + +class SleepingBackgroundTask { + public: + SleepingBackgroundTask() + : bg_cv_(&mutex_), + should_sleep_(true), + done_with_sleep_(false), + sleeping_(false) {} + + bool IsSleeping() { + MutexLock l(&mutex_); + return sleeping_; + } + void DoSleep() { + MutexLock l(&mutex_); + sleeping_ = true; + bg_cv_.SignalAll(); + while (should_sleep_) { + bg_cv_.Wait(); + } + sleeping_ = false; + done_with_sleep_ = true; + bg_cv_.SignalAll(); + } + void WaitUntilSleeping() { + MutexLock l(&mutex_); + while (!sleeping_ || !should_sleep_) { + bg_cv_.Wait(); + } + } + void WakeUp() { + MutexLock l(&mutex_); + should_sleep_ = false; + bg_cv_.SignalAll(); + } + void WaitUntilDone() { + MutexLock l(&mutex_); + while (!done_with_sleep_) { + bg_cv_.Wait(); + } + } + bool WokenUp() { + MutexLock l(&mutex_); + return should_sleep_ == false; + } + + void Reset() { + MutexLock l(&mutex_); + should_sleep_ = true; + done_with_sleep_ = false; + } + + static void DoSleepTask(void* arg) { + reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep(); + } + + private: + port::Mutex mutex_; + port::CondVar bg_cv_; // Signalled when background work finishes + bool should_sleep_; + bool done_with_sleep_; + bool sleeping_; +}; + +// Filters merge operands and values that are equal to `num`. 
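+// A minimal usage sketch (hypothetical `db` handle; values are written with
+// EncodeInt() below so they decode as 8-byte integers -- values that are not
+// 8-byte encodings are filtered out as well):
+//
+//   test::FilterNumber filter(42);           // drop entries encoding 42
+//   rocksdb::Options options;
+//   options.compaction_filter = &filter;
+//   db->Put(rocksdb::WriteOptions(), "k", test::EncodeInt(42));
+//   // "k" is dropped the next time compaction visits it.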
+class FilterNumber : public CompactionFilter {
+ public:
+  explicit FilterNumber(uint64_t num) : num_(num) {}
+
+  std::string last_merge_operand_key() { return last_merge_operand_key_; }
+
+  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
+              const rocksdb::Slice& value, std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  bool FilterMergeOperand(int /*level*/, const rocksdb::Slice& key,
+                          const rocksdb::Slice& value) const override {
+    last_merge_operand_key_ = key.ToString();
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "FilterBadMergeOperand"; }
+
+ private:
+  mutable std::string last_merge_operand_key_;
+  uint64_t num_;
+};
+
+inline std::string EncodeInt(uint64_t x) {
+  std::string result;
+  PutFixed64(&result, x);
+  return result;
+}
+
+class StringEnv : public EnvWrapper {
+ public:
+  class SeqStringSource : public SequentialFile {
+   public:
+    explicit SeqStringSource(const std::string& data)
+        : data_(data), offset_(0) {}
+    ~SeqStringSource() {}
+    Status Read(size_t n, Slice* result, char* scratch) override {
+      std::string output;
+      if (offset_ < data_.size()) {
+        n = std::min(data_.size() - offset_, n);
+        memcpy(scratch, data_.data() + offset_, n);
+        offset_ += n;
+        *result = Slice(scratch, n);
+      } else {
+        return Status::InvalidArgument(
+            "Attempt to read when it has already reached eof.");
+      }
+      return Status::OK();
+    }
+    Status Skip(uint64_t n) override {
+      if (offset_ >= data_.size()) {
+        return Status::InvalidArgument(
+            "Attempt to read when it has already reached eof.");
+      }
+      // TODO(yhchiang): Currently doesn't handle the overflow case.
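+      // One possible guard for the TODO above (a sketch only): clamp the
+      // skip to the bytes that remain, so a huge n cannot wrap offset_ on
+      // platforms with a 32-bit size_t:
+      //   uint64_t remaining = data_.size() - offset_;
+      //   if (n > remaining) n = remaining;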
+      offset_ += static_cast<size_t>(n);
+      return Status::OK();
+    }
+
+   private:
+    std::string data_;
+    size_t offset_;
+  };
+
+  class StringSink : public WritableFile {
+   public:
+    explicit StringSink(std::string* contents)
+        : WritableFile(), contents_(contents) {}
+    virtual Status Truncate(uint64_t size) override {
+      contents_->resize(static_cast<size_t>(size));
+      return Status::OK();
+    }
+    virtual Status Close() override { return Status::OK(); }
+    virtual Status Flush() override { return Status::OK(); }
+    virtual Status Sync() override { return Status::OK(); }
+    virtual Status Append(const Slice& slice) override {
+      contents_->append(slice.data(), slice.size());
+      return Status::OK();
+    }
+
+   private:
+    std::string* contents_;
+  };
+
+  explicit StringEnv(Env* t) : EnvWrapper(t) {}
+  virtual ~StringEnv() {}
+
+  const std::string& GetContent(const std::string& f) { return files_[f]; }
+
+  const Status WriteToNewFile(const std::string& file_name,
+                              const std::string& content) {
+    std::unique_ptr<WritableFile> r;
+    auto s = NewWritableFile(file_name, &r, EnvOptions());
+    if (!s.ok()) {
+      return s;
+    }
+    r->Append(content);
+    r->Flush();
+    r->Close();
+    assert(files_[file_name] == content);
+    return Status::OK();
+  }
+
+  // The Env interface, implemented against the in-memory files_ map;
+  // operations the tests do not need simply return Status::NotSupported().
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& /*options*/) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return Status::NotFound("The specified file does not exist", f);
+    }
+    r->reset(new SeqStringSource(iter->second));
+    return Status::OK();
+  }
+  Status NewRandomAccessFile(const std::string& /*f*/,
+                             std::unique_ptr<RandomAccessFile>* /*r*/,
+                             const EnvOptions& /*options*/) override {
+    return Status::NotSupported();
+  }
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& /*options*/) override {
+    auto iter = files_.find(f);
+    if (iter != files_.end()) {
+      return Status::IOError("The specified file already exists", f);
+    }
+    r->reset(new StringSink(&files_[f]));
+    return Status::OK();
+  }
+  virtual Status NewDirectory(const std::string& /*name*/,
+                              std::unique_ptr<Directory>* /*result*/) override {
+    return Status::NotSupported();
+  }
+  Status FileExists(const std::string& f) override {
+    if (files_.find(f) == files_.end()) {
+      return Status::NotFound();
+    }
+    return Status::OK();
+  }
+  Status GetChildren(const std::string& /*dir*/,
+                     std::vector<std::string>* /*r*/) override {
+    return Status::NotSupported();
+  }
+  Status DeleteFile(const std::string& f) override {
+    files_.erase(f);
+    return Status::OK();
+  }
+  Status CreateDir(const std::string& /*d*/) override {
+    return Status::NotSupported();
+  }
+  Status CreateDirIfMissing(const std::string& /*d*/) override {
+    return Status::NotSupported();
+  }
+  Status DeleteDir(const std::string& /*d*/) override {
+    return Status::NotSupported();
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return Status::NotFound("The specified file does not exist:", f);
+    }
+    *s = iter->second.size();
+    return Status::OK();
+  }
+
+  Status GetFileModificationTime(const std::string& /*fname*/,
+                                 uint64_t* /*file_mtime*/) override {
+    return Status::NotSupported();
+  }
+
+  Status RenameFile(const std::string& /*s*/,
+                    const std::string& /*t*/) override {
+    return Status::NotSupported();
+  }
+
+  Status LinkFile(const std::string& /*s*/, const std::string& 
/*t*/) override { + return Status::NotSupported(); + } + + Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { + return Status::NotSupported(); + } + + Status UnlockFile(FileLock* /*l*/) override { return Status::NotSupported(); } + + protected: + std::unordered_map<std::string, std::string> files_; +}; + +// Randomly initialize the given DBOptions +void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); + +// Randomly initialize the given ColumnFamilyOptions +// Note that the caller is responsible for releasing non-null +// cf_opt->compaction_filter. +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd); + +// A dummy merge operator which can change its name +class ChanglingMergeOperator : public MergeOperator { + public: + explicit ChanglingMergeOperator(const std::string& name) + : name_(name + "MergeOperator") {} + ~ChanglingMergeOperator() {} + + void SetName(const std::string& name) { name_ = name; } + + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { + return false; + } + virtual bool PartialMergeMulti(const Slice& /*key*/, + const std::deque<Slice>& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { + return false; + } + virtual const char* Name() const override { return name_.c_str(); } + + protected: + std::string name_; +}; + +// Returns a dummy merge operator with random name. +MergeOperator* RandomMergeOperator(Random* rnd); + +// A dummy compaction filter which can change its name +class ChanglingCompactionFilter : public CompactionFilter { + public: + explicit ChanglingCompactionFilter(const std::string& name) + : name_(name + "CompactionFilter") {} + ~ChanglingCompactionFilter() {} + + void SetName(const std::string& name) { name_ = name; } + + bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; +}; + +// Returns a dummy compaction filter with a random name. +CompactionFilter* RandomCompactionFilter(Random* rnd); + +// A dummy compaction filter factory which can change its name +class ChanglingCompactionFilterFactory : public CompactionFilterFactory { + public: + explicit ChanglingCompactionFilterFactory(const std::string& name) + : name_(name + "CompactionFilterFactory") {} + ~ChanglingCompactionFilterFactory() {} + + void SetName(const std::string& name) { name_ = name; } + + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr<CompactionFilter>(); + } + + // Returns a name that identifies this compaction filter factory. 
+ const char* Name() const override { return name_.c_str(); } + + protected: + std::string name_; +}; + +CompressionType RandomCompressionType(Random* rnd); + +void RandomCompressionTypeVector(const size_t count, + std::vector<CompressionType>* types, + Random* rnd); + +CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd); + +const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined = -1); + +TableFactory* RandomTableFactory(Random* rnd, int pre_defined = -1); + +std::string RandomName(Random* rnd, const size_t len); + +Status DestroyDir(Env* env, const std::string& dir); + +bool IsDirectIOSupported(Env* env, const std::string& dir); + +} // namespace test +} // namespace rocksdb diff --git a/src/rocksdb/util/thread_list_test.cc b/src/rocksdb/util/thread_list_test.cc new file mode 100644 index 00000000..a4a343a9 --- /dev/null +++ b/src/rocksdb/util/thread_list_test.cc @@ -0,0 +1,352 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <mutex> +#include <condition_variable> + +#include "monitoring/thread_status_updater.h" +#include "rocksdb/db.h" +#include "util/testharness.h" + +#ifdef ROCKSDB_USING_THREAD_STATUS + +namespace rocksdb { + +class SimulatedBackgroundTask { + public: + SimulatedBackgroundTask( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name, + const ThreadStatus::OperationType operation_type = + ThreadStatus::OP_UNKNOWN, + const ThreadStatus::StateType state_type = + ThreadStatus::STATE_UNKNOWN) + : db_key_(db_key), db_name_(db_name), + cf_key_(cf_key), cf_name_(cf_name), + operation_type_(operation_type), state_type_(state_type), + should_run_(true), running_count_(0) { + Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo( + db_key_, db_name_, cf_key_, cf_name_); + } + + ~SimulatedBackgroundTask() { + Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_); + } + + void Run() { + std::unique_lock<std::mutex> l(mutex_); + running_count_++; + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( + operation_type_); + Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_); + while (should_run_) { + bg_cv_.wait(l); + } + Env::Default()->GetThreadStatusUpdater()->ClearThreadState(); + Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation(); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(nullptr); + running_count_--; + bg_cv_.notify_all(); + } + + void FinishAllTasks() { + std::unique_lock<std::mutex> l(mutex_); + should_run_ = false; + bg_cv_.notify_all(); + } + + void WaitUntilScheduled(int job_count, Env* env) { + while (running_count_ < job_count) { + env->SleepForMicroseconds(1000); + } + } + + void WaitUntilDone() { + std::unique_lock<std::mutex> l(mutex_); + while (running_count_ > 0) { + bg_cv_.wait(l); + } + } + + static void DoSimulatedTask(void* arg) { + reinterpret_cast<SimulatedBackgroundTask*>(arg)->Run(); + } + + private: + const void* db_key_; + const std::string db_name_; + const void* cf_key_; + const std::string cf_name_; + const ThreadStatus::OperationType operation_type_; + const ThreadStatus::StateType state_type_; + std::mutex mutex_; + std::condition_variable bg_cv_; + bool should_run_; + std::atomic<int> 
running_count_; +}; + +class ThreadListTest : public testing::Test { + public: + ThreadListTest() { + } +}; + +TEST_F(ThreadListTest, GlobalTables) { + // verify the global tables for operations and states are properly indexed. + for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) { + ASSERT_EQ(global_operation_table[type].type, type); + ASSERT_EQ(global_operation_table[type].name, + ThreadStatus::GetOperationName( + ThreadStatus::OperationType(type))); + } + + for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) { + ASSERT_EQ(global_state_table[type].type, type); + ASSERT_EQ(global_state_table[type].name, + ThreadStatus::GetStateName( + ThreadStatus::StateType(type))); + } + + for (int stage = 0; stage != ThreadStatus::NUM_OP_STAGES; ++stage) { + ASSERT_EQ(global_op_stage_table[stage].stage, stage); + ASSERT_EQ(global_op_stage_table[stage].name, + ThreadStatus::GetOperationStageName( + ThreadStatus::OperationStage(stage))); + } +} + +TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { + Env* env = Env::Default(); + const int kHighPriorityThreads = 3; + const int kLowPriorityThreads = 5; + const int kSimulatedHighPriThreads = kHighPriorityThreads - 1; + const int kSimulatedLowPriThreads = kLowPriorityThreads / 3; + env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH); + env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW); + + SimulatedBackgroundTask running_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu"); + + for (int test = 0; test < kSimulatedHighPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::HIGH); + } + for (int test = 0; test < kSimulatedLowPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::LOW); + } + running_task.WaitUntilScheduled( + kSimulatedHighPriThreads + kSimulatedLowPriThreads, env); + + std::vector<ThreadStatus> thread_list; + + // Verify the number of running threads in each pool. 
+ env->GetThreadList(&thread_list); + int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0}; + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + ASSERT_EQ( + running_count[ThreadStatus::HIGH_PRIORITY], + kSimulatedHighPriThreads); + ASSERT_EQ( + running_count[ThreadStatus::LOW_PRIORITY], + kSimulatedLowPriThreads); + ASSERT_EQ( + running_count[ThreadStatus::USER], 0); + + running_task.FinishAllTasks(); + running_task.WaitUntilDone(); + + // Verify none of the threads are running + env->GetThreadList(&thread_list); + + for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) { + running_count[i] = 0; + } + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + + ASSERT_EQ( + running_count[ThreadStatus::HIGH_PRIORITY], 0); + ASSERT_EQ( + running_count[ThreadStatus::LOW_PRIORITY], 0); + ASSERT_EQ( + running_count[ThreadStatus::USER], 0); +} + +namespace { + void UpdateStatusCounts( + const std::vector<ThreadStatus>& thread_list, + int operation_counts[], int state_counts[]) { + for (auto thread_status : thread_list) { + operation_counts[thread_status.operation_type]++; + state_counts[thread_status.state_type]++; + } + } + + void VerifyAndResetCounts( + const int correct_counts[], int collected_counts[], int size) { + for (int i = 0; i < size; ++i) { + ASSERT_EQ(collected_counts[i], correct_counts[i]); + collected_counts[i] = 0; + } + } + + void UpdateCount( + int operation_counts[], int from_event, int to_event, int amount) { + operation_counts[from_event] -= amount; + operation_counts[to_event] += amount; + } +} // namespace + +TEST_F(ThreadListTest, SimpleEventTest) { + Env* env = Env::Default(); + + // simulated tasks + const int kFlushWriteTasks = 3; + SimulatedBackgroundTask flush_write_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_FLUSH); + + const int kCompactionWriteTasks = 4; + SimulatedBackgroundTask compaction_write_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionReadTasks = 5; + SimulatedBackgroundTask compaction_read_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionWaitTasks = 6; + SimulatedBackgroundTask compaction_wait_task( + reinterpret_cast<void*>(1234), "running", + reinterpret_cast<void*>(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + // setup right answers + int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + correct_operation_counts[ThreadStatus::OP_FLUSH] = + kFlushWriteTasks; + correct_operation_counts[ThreadStatus::OP_COMPACTION] = + kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks; + + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_FLUSH], Env::HIGH); + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW); + + // schedule the simulated tasks + for (int t = 0; t < kFlushWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &flush_write_task, Env::Priority::HIGH); + } + flush_write_task.WaitUntilScheduled(kFlushWriteTasks, env); + + for (int t = 0; t < kCompactionWriteTasks; ++t) { + 
env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_write_task, Env::Priority::LOW);
+  }
+  compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks, env);
+
+  for (int t = 0; t < kCompactionReadTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_read_task, Env::Priority::LOW);
+  }
+  compaction_read_task.WaitUntilScheduled(kCompactionReadTasks, env);
+
+  for (int t = 0; t < kCompactionWaitTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_wait_task, Env::Priority::LOW);
+  }
+  compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks, env);
+
+  // verify the thread-status
+  int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+  int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0};
+
+  std::vector<ThreadStatus> thread_list;
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-wait tasks and see if the thread-status
+  // reflects this update
+  compaction_wait_task.FinishAllTasks();
+  compaction_wait_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate flush-write tasks and see if the thread-status
+  // reflects this update
+  flush_write_task.FinishAllTasks();
+  flush_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH,
+              ThreadStatus::OP_UNKNOWN, kFlushWriteTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-write tasks and see if the thread-status
+  // reflects this update
+  compaction_write_task.FinishAllTasks();
+  compaction_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-read tasks and see if the thread-status
+  // reflects this update
+  compaction_read_task.FinishAllTasks();
+  compaction_read_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionReadTasks);
+
+  env->GetThreadList(&thread_list);
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return 0;
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
diff --git a/src/rocksdb/util/thread_local.cc b/src/rocksdb/util/thread_local.cc
new file mode 100644
index 00000000..7346eff1
--- /dev/null
+++ b/src/rocksdb/util/thread_local.cc
@@ -0,0 +1,554 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/thread_local.h"
+#include "util/mutexlock.h"
+#include "port/likely.h"
+#include <stdlib.h>
+
+namespace rocksdb {
+
+struct Entry {
+  Entry() : ptr(nullptr) {}
+  Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {}
+  std::atomic<void*> ptr;
+};
+
+class StaticMeta;
+
+// This is the structure that is declared as "thread_local" storage.
+// The vector keeps a list of atomic pointers, one per ThreadLocalPtr
+// instance, for the "current" thread. The vector is indexed by an Id that
+// is unique in the process and associated with one ThreadLocalPtr instance.
+// The Id is assigned by a global StaticMeta singleton. So if we instantiated
+// 3 ThreadLocalPtr instances, each thread will have a ThreadData with a
+// vector of size 3:
+//     ---------------------------------------------------
+//     |          | instance 1 | instance 2 | instance 3 |
+//     ---------------------------------------------------
+//     | thread 1 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 2 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 3 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+struct ThreadData {
+  explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst)
+      : entries(),
+        next(nullptr),
+        prev(nullptr),
+        inst(_inst) {}
+  std::vector<Entry> entries;
+  ThreadData* next;
+  ThreadData* prev;
+  ThreadLocalPtr::StaticMeta* inst;
+};
+
+class ThreadLocalPtr::StaticMeta {
+public:
+  StaticMeta();
+
+  // Return the next available Id
+  uint32_t GetId();
+  // Return the next available Id without claiming it
+  uint32_t PeekId() const;
+  // Return the given Id back to the free pool. This also triggers the
+  // UnrefHandler for the associated pointer value (if not NULL) for all
+  // threads.
+  void ReclaimId(uint32_t id);
+
+  // Return the pointer value for the given id for the current thread.
+  void* Get(uint32_t id) const;
+  // Reset the pointer value for the given id for the current thread.
+  void Reset(uint32_t id, void* ptr);
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(uint32_t id, void* ptr);
+  // Atomically compare and swap the provided value only if it equals
+  // the expected value.
+  bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
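+  // For example, a FoldFunc summing per-thread counters stored as
+  // heap-allocated uint64_t values might look like this (sketch; assumes
+  // `inst` and `id` are the StaticMeta instance and a claimed Id):
+  //   uint64_t total = 0;
+  //   inst->Fold(id, [](void* entry, void* res) {
+  //     *static_cast<uint64_t*>(res) += *static_cast<uint64_t*>(entry);
+  //   }, &total);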
+  void Fold(uint32_t id, FoldFunc func, void* res);
+
+  // Register the UnrefHandler for id
+  void SetHandler(uint32_t id, UnrefHandler handler);
+
+  // Protects inst, next_instance_id_, free_instance_ids_, head_, and
+  // ThreadData.entries.
+  //
+  // Note that here we prefer a function-level static variable to the usual
+  // global static variable. The reason is that C++ destroys static variables
+  // in the reverse order of their construction, but guarantees no particular
+  // construction order when global static variables are defined in different
+  // files. Function-level static variables, on the other hand, are
+  // initialized when their enclosing function is first called, so their
+  // construction order can be controlled by arranging the first function
+  // calls in the right order.
+  //
+  // For instance, the following function contains a function-level static
+  // variable. We place a dummy call to it inside Env::Default() to pin down
+  // its construction order.
+  static port::Mutex* Mutex();
+
+  // Returns the member mutex of the current StaticMeta. In general,
+  // Mutex() should be used instead of this one. However, in case where
+  // the static variable inside Instance() goes out of scope, MemberMutex()
+  // should be used. One example is the OnThreadExit() function.
+  port::Mutex* MemberMutex() { return &mutex_; }
+
+private:
+  // Get the UnrefHandler for id.
+  // REQUIRES: mutex locked
+  UnrefHandler GetHandler(uint32_t id);
+
+  // Triggered before a thread terminates
+  static void OnThreadExit(void* ptr);
+
+  // Add the current thread's ThreadData to the global chain
+  // REQUIRES: mutex locked
+  void AddThreadData(ThreadData* d);
+
+  // Remove the current thread's ThreadData from the global chain
+  // REQUIRES: mutex locked
+  void RemoveThreadData(ThreadData* d);
+
+  static ThreadData* GetThreadLocal();
+
+  uint32_t next_instance_id_;
+  // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
+  // frequently. This also prevents it from blowing up the vector space.
+  autovector<uint32_t> free_instance_ids_;
+  // Chain all thread local structures together. This is necessary since
+  // when one ThreadLocalPtr gets destroyed, we need to loop over each
+  // thread's version of the pointer corresponding to that instance and
+  // call UnrefHandler for it.
+  ThreadData head_;
+
+  std::unordered_map<uint32_t, UnrefHandler> handler_map_;
+
+  // The private mutex. Developers should always use Mutex() instead of
+  // using this variable directly.
+  port::Mutex mutex_;
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+  // Thread local storage
+  static __thread ThreadData* tls_;
+#endif
+
+  // Used to make thread exit trigger possible if !defined(OS_MACOSX).
+  // Otherwise, used to retrieve thread data.
+  pthread_key_t pthread_key_;
+};
+
+
+#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+__thread ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
+#endif
+
+// Windows doesn't support a per-thread destructor with its
+// TLS primitives. So, we build it manually by inserting a
+// function to be called on each thread's exit.
+// See http://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
+// and http://www.nynaeve.net/?p=183
+//
+// Really, we do this to have a clear conscience, since using TLS with
+// thread pools is iffy (although OK within a single request); otherwise,
+// threads have no identity in their modern use.
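+// For contrast, POSIX needs none of the machinery below: the per-thread
+// destructor is registered once at key creation, as StaticMeta's
+// constructor does further down. A sketch:
+//
+//   pthread_key_t key;
+//   pthread_key_create(&key, &OnThreadExit);  // dtor runs at thread exit
+//
+// Windows TLS offers no such hook, hence the loader-callback tricks below.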
+
+// This runs on Windows only, called from the system loader.
+#ifdef OS_WIN
+
+// The Windows cleanup routine is invoked from the system loader with a
+// different signature, so we cannot directly hook up the original
+// OnThreadExit, which is a private member; instead, StaticMeta shares the
+// address of that function with us so we can invoke it.
+namespace wintlscleanup {
+
+// This is set to OnThreadExit in StaticMeta singleton constructor
+UnrefHandler thread_local_inclass_routine = nullptr;
+pthread_key_t thread_local_key = pthread_key_t (-1);
+
+// Static callback function to call with each thread termination.
+void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
+  // We decided to punt on PROCESS_EXIT
+  if (DLL_THREAD_DETACH == reason) {
+    if (thread_local_key != pthread_key_t(-1) &&
+        thread_local_inclass_routine != nullptr) {
+      void* tls = TlsGetValue(thread_local_key);
+      if (tls != nullptr) {
+        thread_local_inclass_routine(tls);
+      }
+    }
+  }
+}
+
+}  // wintlscleanup
+
+// extern "C" suppresses C++ name mangling so we know the symbol name for the
+// linker /INCLUDE:symbol pragmas below.
+extern "C" {
+
+#ifdef _MSC_VER
+// The linker must not discard thread_callback_on_exit. (We force a reference
+// to this variable with a linker /include:symbol pragma to ensure that.) If
+// this variable is discarded, the OnThreadExit function will never be called.
+#ifndef _X86_
+
+// .CRT section is merged with .rdata on x64 so it must be constant data.
+#pragma const_seg(".CRT$XLB")
+// When defining a const variable, it must have external linkage to be sure the
+// linker doesn't discard it.
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit;
+const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit =
+    wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma const_seg()
+
+#pragma comment(linker, "/include:_tls_used")
+#pragma comment(linker, "/include:p_thread_callback_on_exit")
+
+#else  // _X86_
+
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma data_seg()
+
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit")
+
+#endif  // _X86_
+
+#else
+// https://github.com/couchbase/gperftools/blob/master/src/windows/port.cc
+BOOL WINAPI DllMain(HINSTANCE h, DWORD dwReason, PVOID pv) {
+  if (dwReason == DLL_THREAD_DETACH)
+    wintlscleanup::WinOnThreadExit(h, dwReason, pv);
+  return TRUE;
+}
+#endif
+}  // extern "C"
+
+#endif  // OS_WIN
+
+void ThreadLocalPtr::InitSingletons() { ThreadLocalPtr::Instance(); }
+
+ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
+  // Here we prefer a function-level static variable to a global static
+  // variable, as a function-level static variable is initialized when the
+  // function is first called. As a result, we can properly control its
+  // construction order by preparing its first function call.
+  //
+  // Note that here we decide to make "inst" a static pointer w/o deleting
+  // it at the end, instead of a static variable. This is to avoid the
+  // following destruction-order disaster, which happens when a child thread
+  // using ThreadLocalPtr dies AFTER the main thread dies: when a child
+  // thread happens to use ThreadLocalPtr, it will try to delete its
+  // thread-local data in its OnThreadExit when the child thread dies.
+  // However, OnThreadExit depends on the following variable. As a result,
+  // if the main thread dies before any child thread that happens to use
+  // ThreadLocalPtr dies, then the destruction of the following variable
+  // will go first, then OnThreadExit, therefore causing invalid access.
+  //
+  // The above problem can be solved by using thread_local to store tls_
+  // instead of using __thread. The major difference between thread_local
+  // and __thread is that thread_local supports dynamic construction and
+  // destruction of non-primitive typed variables. As a result, we can
+  // guarantee the destruction order even when the main thread dies before
+  // any child threads. However, thread_local is not supported in all
+  // compilers that accept -std=c++11 (e.g., Mac with Xcode < 8; Xcode 8+
+  // supports thread_local).
+  static ThreadLocalPtr::StaticMeta* inst = new ThreadLocalPtr::StaticMeta();
+  return inst;
+}
+
+port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() { return &Instance()->mutex_; }
+
+void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
+  auto* tls = static_cast<ThreadData*>(ptr);
+  assert(tls != nullptr);
+
+  // Use the StaticMeta pointer cached in the ThreadData instead of calling
+  // StaticMeta::Instance() directly: the static variable inside Instance()
+  // might already have gone out of scope here, in case this OnThreadExit is
+  // called after the main thread dies.
+  auto* inst = tls->inst;
+  pthread_setspecific(inst->pthread_key_, nullptr);
+
+  MutexLock l(inst->MemberMutex());
+  inst->RemoveThreadData(tls);
+  // Unref stored pointers of current thread from all instances
+  uint32_t id = 0;
+  for (auto& e : tls->entries) {
+    void* raw = e.ptr.load();
+    if (raw != nullptr) {
+      auto unref = inst->GetHandler(id);
+      if (unref != nullptr) {
+        unref(raw);
+      }
+    }
+    ++id;
+  }
+  // Delete the thread-local structure no matter the platform (Mac included)
+  delete tls;
+}
+
+ThreadLocalPtr::StaticMeta::StaticMeta()
+    : next_instance_id_(0),
+      head_(this),
+      pthread_key_(0) {
+  if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
+    abort();
+  }
+
+  // OnThreadExit does not get called for the main thread.
+  // Call through the static destructor mechanism to avoid a memory leak.
+  //
+  // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global
+  // singleton (destructors are invoked in reverse order of constructor
+  // _completion_); the latter must not mutate internal members. This
+  // cleanup mechanism inherently relies on use-after-release of the
+  // StaticMeta, and is brittle with respect to compiler-specific handling
+  // of memory backing destructed statically-scoped objects. Perhaps
+  // registering with atexit(3) would be more robust.
+  //
+  // This is not required on Windows.
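+  // A sketch of the atexit(3) alternative mentioned above (hypothetical;
+  // CleanupMainThread would have to be a private static member to reach
+  // pthread_key_):
+  //
+  //   static void CleanupMainThread() {
+  //     void* tls = pthread_getspecific(Instance()->pthread_key_);
+  //     if (tls != nullptr) OnThreadExit(tls);
+  //   }
+  //   // in StaticMeta::StaticMeta():  atexit(&CleanupMainThread);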
+#if !defined(OS_WIN)
+  static struct A {
+    ~A() {
+#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL
+      ThreadData* tls_ =
+        static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+      if (tls_) {
+        OnThreadExit(tls_);
+      }
+    }
+  } a;
+#endif  // !defined(OS_WIN)
+
+  head_.next = &head_;
+  head_.prev = &head_;
+
+#ifdef OS_WIN
+  // Share with Windows its cleanup routine and the key
+  wintlscleanup::thread_local_inclass_routine = OnThreadExit;
+  wintlscleanup::thread_local_key = pthread_key_;
+#endif
+}
+
+void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next = &head_;
+  d->prev = head_.prev;
+  head_.prev->next = d;
+  head_.prev = d;
+}
+
+void ThreadLocalPtr::StaticMeta::RemoveThreadData(
+    ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next->prev = d->prev;
+  d->prev->next = d->next;
+  d->next = d->prev = d;
+}
+
+ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
+#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL
+  // Make this local variable name look like a member variable so that we
+  // can share all the code below
+  ThreadData* tls_ =
+      static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+
+  if (UNLIKELY(tls_ == nullptr)) {
+    auto* inst = Instance();
+    tls_ = new ThreadData(inst);
+    {
+      // Register it in the global chain; this needs to be done before thread
+      // exit handler registration
+      MutexLock l(Mutex());
+      inst->AddThreadData(tls_);
+    }
+    // Even if it is not OS_MACOSX, we need to register a value for
+    // pthread_key_ so that its exit handler will be triggered.
+    if (pthread_setspecific(inst->pthread_key_, tls_) != 0) {
+      {
+        MutexLock l(Mutex());
+        inst->RemoveThreadData(tls_);
+      }
+      delete tls_;
+      abort();
+    }
+  }
+  return tls_;
+}
+
+void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    return nullptr;
+  }
+  return tls->entries[id].ptr.load(std::memory_order_acquire);
+}
+
+void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  tls->entries[id].ptr.store(ptr, std::memory_order_release);
+}
+
+void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire);
+}
+
+bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
+                                                void*& expected) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.compare_exchange_strong(
+      expected, ptr, std::memory_order_release, std::memory_order_relaxed);
+}
+
+void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
+                                        void* const replacement) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr =
+          t->entries[id].ptr.exchange(replacement, std::memory_order_acquire);
+      if (ptr != nullptr) {
+        ptrs->push_back(ptr);
+      }
+    }
+  }
+}
+
+void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) {
+  MutexLock l(Mutex());
+  for 
(ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = t->entries[id].ptr.load(); + if (ptr != nullptr) { + func(ptr, res); + } + } + } +} + +uint32_t ThreadLocalPtr::TEST_PeekId() { + return Instance()->PeekId(); +} + +void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) { + MutexLock l(Mutex()); + handler_map_[id] = handler; +} + +UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) { + Mutex()->AssertHeld(); + auto iter = handler_map_.find(id); + if (iter == handler_map_.end()) { + return nullptr; + } + return iter->second; +} + +uint32_t ThreadLocalPtr::StaticMeta::GetId() { + MutexLock l(Mutex()); + if (free_instance_ids_.empty()) { + return next_instance_id_++; + } + + uint32_t id = free_instance_ids_.back(); + free_instance_ids_.pop_back(); + return id; +} + +uint32_t ThreadLocalPtr::StaticMeta::PeekId() const { + MutexLock l(Mutex()); + if (!free_instance_ids_.empty()) { + return free_instance_ids_.back(); + } + return next_instance_id_; +} + +void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { + // This id is not used, go through all thread local data and release + // corresponding value + MutexLock l(Mutex()); + auto unref = GetHandler(id); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = t->entries[id].ptr.exchange(nullptr); + if (ptr != nullptr && unref != nullptr) { + unref(ptr); + } + } + } + handler_map_[id] = nullptr; + free_instance_ids_.push_back(id); +} + +ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler) + : id_(Instance()->GetId()) { + if (handler != nullptr) { + Instance()->SetHandler(id_, handler); + } +} + +ThreadLocalPtr::~ThreadLocalPtr() { + Instance()->ReclaimId(id_); +} + +void* ThreadLocalPtr::Get() const { + return Instance()->Get(id_); +} + +void ThreadLocalPtr::Reset(void* ptr) { + Instance()->Reset(id_, ptr); +} + +void* ThreadLocalPtr::Swap(void* ptr) { + return Instance()->Swap(id_, ptr); +} + +bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) { + return Instance()->CompareAndSwap(id_, ptr, expected); +} + +void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) { + Instance()->Scrape(id_, ptrs, replacement); +} + +void ThreadLocalPtr::Fold(FoldFunc func, void* res) { + Instance()->Fold(id_, func, res); +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/thread_local.h b/src/rocksdb/util/thread_local.h new file mode 100644 index 00000000..5dad7292 --- /dev/null +++ b/src/rocksdb/util/thread_local.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <atomic> +#include <functional> +#include <memory> +#include <unordered_map> +#include <vector> + +#include "util/autovector.h" +#include "port/port.h" + +namespace rocksdb { + +// Cleanup function that will be called for a stored thread local +// pointer (if not NULL) when one of the following happens: +// (1) a thread terminates +// (2) a ThreadLocalPtr is destroyed +// +// Warning: this function is called while holding a global mutex. 
The same mutex
+// is used (at least in some cases) by most methods of ThreadLocalPtr, and it's
+// shared across all instances of ThreadLocalPtr. Therefore extra care
+// is needed to avoid deadlocks. In particular, the handler shouldn't lock any
+// mutexes and shouldn't call any methods of any ThreadLocalPtr instances,
+// unless you know what you're doing.
+typedef void (*UnrefHandler)(void* ptr);
+
+// ThreadLocalPtr stores only values of pointer type. Unlike the usual
+// thread-local storage, ThreadLocalPtr has the ability to distinguish data
+// coming from different threads and different ThreadLocalPtr instances.
+// For example, if a regular thread_local variable A is declared in DBImpl,
+// two DBImpl objects would share the same A. A ThreadLocalPtr that is
+// defined under the scope of DBImpl, however, avoids such a conflict.
+// As a result, its memory usage would be O(# of threads * # of
+// ThreadLocalPtr instances).
+class ThreadLocalPtr {
+ public:
+  explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
+
+  ThreadLocalPtr(const ThreadLocalPtr&) = delete;
+  ThreadLocalPtr& operator=(const ThreadLocalPtr&) = delete;
+
+  ~ThreadLocalPtr();
+
+  // Return the current pointer stored in thread local
+  void* Get() const;
+
+  // Set a new pointer value to the thread local storage.
+  void Reset(void* ptr);
+
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(void* ptr);
+
+  // Atomically compare the stored value with expected. Set the new
+  // pointer value to thread local only if the comparison is true.
+  // Otherwise, the stored value is returned through expected.
+  // Return true on success, false on failure
+  bool CompareAndSwap(void* ptr, void*& expected);
+
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(autovector<void*>* ptrs, void* const replacement);
+
+  typedef std::function<void(void*, void*)> FoldFunc;
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
+  void Fold(FoldFunc func, void* res);
+
+  // For testing only:
+  // Return the next available Id without claiming it
+  static uint32_t TEST_PeekId();
+
+  // Initialize the static singletons of the ThreadLocalPtr.
+  //
+  // If this function is not called, then the singletons will be
+  // automatically initialized when they are used.
+  //
+  // Calling this function twice or after the singletons have been
+  // initialized is a no-op.
+  static void InitSingletons();
+
+  class StaticMeta;
+
+private:
+
+  static StaticMeta* Instance();
+
+  const uint32_t id_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/thread_local_test.cc b/src/rocksdb/util/thread_local_test.cc
new file mode 100644
index 00000000..789be83d
--- /dev/null
+++ b/src/rocksdb/util/thread_local_test.cc
@@ -0,0 +1,582 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
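+// A quick sketch of the API exercised below: each ThreadLocalPtr stores one
+// void* per thread, and an optional UnrefHandler runs for every non-null
+// stored pointer on thread exit or instance destruction. For instance:
+//
+//   static void Unref(void* p) { delete static_cast<int*>(p); }
+//   ThreadLocalPtr tls(&Unref);
+//   tls.Reset(new int(7));                  // sets this thread's slot
+//   int v = *static_cast<int*>(tls.Get());  // 7; other threads still get null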
+ +#include <thread> +#include <atomic> +#include <string> + +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/autovector.h" +#include "util/sync_point.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "util/thread_local.h" + +namespace rocksdb { + +class ThreadLocalTest : public testing::Test { + public: + ThreadLocalTest() : env_(Env::Default()) {} + + Env* env_; +}; + +namespace { + +struct Params { + Params(port::Mutex* m, port::CondVar* c, int* u, int n, + UnrefHandler handler = nullptr) + : mu(m), + cv(c), + unref(u), + total(n), + started(0), + completed(0), + doWrite(false), + tls1(handler), + tls2(nullptr) {} + + port::Mutex* mu; + port::CondVar* cv; + int* unref; + int total; + int started; + int completed; + bool doWrite; + ThreadLocalPtr tls1; + ThreadLocalPtr* tls2; +}; + +class IDChecker : public ThreadLocalPtr { +public: + static uint32_t PeekId() { + return TEST_PeekId(); + } +}; + +} // anonymous namespace + +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ +TEST_F(ThreadLocalTest, UniqueIdTest) { + port::Mutex mu; + port::CondVar cv(&mu); + + ASSERT_EQ(IDChecker::PeekId(), 0u); + // New ThreadLocal instance bumps id by 1 + { + // Id used 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Id used 1 + Params p2(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // Id used 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // Id used 3 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 4u); + } + // id 3, 2, 1, 0 are in the free queue in order + ASSERT_EQ(IDChecker::PeekId(), 0u); + + // pick up 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // pick up 1 + Params* p2 = new Params(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // pick up 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // return up 1 + delete p2; + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Now we have 3, 1 in queue + // pick up 1 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // pick up 3 + Params p5(&mu, &cv, nullptr, 1u); + // next new id + ASSERT_EQ(IDChecker::PeekId(), 4u); + // After exit, id sequence in queue: + // 3, 1, 2, 0 +} +#endif // __clang_analyzer__ + +TEST_F(ThreadLocalTest, SequentialReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + ASSERT_EQ(IDChecker::PeekId(), 0u); + + port::Mutex mu; + port::CondVar cv(&mu); + Params p(&mu, &cv, nullptr, 1); + ThreadLocalPtr tls2; + p.tls2 = &tls2; + + auto func = [](void* ptr) { + auto& params = *static_cast<Params*>(ptr); + + ASSERT_TRUE(params.tls1.Get() == nullptr); + params.tls1.Reset(reinterpret_cast<int*>(1)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(1)); + params.tls1.Reset(reinterpret_cast<int*>(2)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(2)); + + ASSERT_TRUE(params.tls2->Get() == nullptr); + params.tls2->Reset(reinterpret_cast<int*>(1)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(1)); + params.tls2->Reset(reinterpret_cast<int*>(2)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(2)); + + params.mu->Lock(); + ++(params.completed); + params.cv->SignalAll(); + params.mu->Unlock(); + }; + + for (int iter = 0; iter < 1024; ++iter) { + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Another new thread, read/write should not see value from previous thread + env_->StartThread(func, static_cast<void*>(&p)); + mu.Lock(); + while 
(p.completed != iter + 1) {
+      cv.Wait();
+    }
+    mu.Unlock();
+    ASSERT_EQ(IDChecker::PeekId(), 1u);
+  }
+}
+
+TEST_F(ThreadLocalTest, ConcurrentReadWriteTest) {
+  // global id list carries over 3, 1, 2, 0
+  ASSERT_EQ(IDChecker::PeekId(), 0u);
+
+  ThreadLocalPtr tls2;
+  port::Mutex mu1;
+  port::CondVar cv1(&mu1);
+  Params p1(&mu1, &cv1, nullptr, 16);
+  p1.tls2 = &tls2;
+
+  port::Mutex mu2;
+  port::CondVar cv2(&mu2);
+  Params p2(&mu2, &cv2, nullptr, 16);
+  p2.doWrite = true;
+  p2.tls2 = &tls2;
+
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    p.mu->Lock();
+    // size_t matches the size of the pointer type we want to cast to.
+    size_t own = ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    // Let write threads write a different value from the read threads
+    if (p.doWrite) {
+      own += 8192;
+    }
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    auto* env = Env::Default();
+    auto start = env->NowMicros();
+
+    p.tls1.Reset(reinterpret_cast<size_t*>(own));
+    p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+    // Loop for 1 second
+    while (env->NowMicros() - start < 1000 * 1000) {
+      for (int iter = 0; iter < 100000; ++iter) {
+        ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<size_t*>(own));
+        ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<size_t*>(own + 1));
+        if (p.doWrite) {
+          p.tls1.Reset(reinterpret_cast<size_t*>(own));
+          p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+        }
+      }
+    }
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+    p.mu->Unlock();
+  };
+
+  // Initiate 2 instances: one keeps writing and one keeps reading.
+  // The read instance should not see data from the write instance.
+  // Each thread-local copy of the value is also different from the
+  // others.
+ for (int th = 0; th < p1.total; ++th) { + env_->StartThread(func, static_cast<void*>(&p1)); + } + for (int th = 0; th < p2.total; ++th) { + env_->StartThread(func, static_cast<void*>(&p2)); + } + + mu1.Lock(); + while (p1.completed != p1.total) { + cv1.Wait(); + } + mu1.Unlock(); + + mu2.Lock(); + while (p2.completed != p2.total) { + cv2.Wait(); + } + mu2.Unlock(); + + ASSERT_EQ(IDChecker::PeekId(), 3u); +} + +TEST_F(ThreadLocalTest, Unref) { + ASSERT_EQ(IDChecker::PeekId(), 0u); + + auto unref = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + // Case 0: no unref triggered if ThreadLocalPtr is never accessed + auto func0 = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func0, static_cast<void*>(&p)); + } + env_->WaitForJoin(); + ASSERT_EQ(unref_count, 0); + } + + // Case 1: unref triggered by thread exit + auto func1 = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + ThreadLocalPtr tls2(unref); + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = &tls2; + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func1, static_cast<void*>(&p)); + } + + env_->WaitForJoin(); + + // N threads x 2 ThreadLocal instance cleanup on thread exit + ASSERT_EQ(unref_count, 2 * p.total); + } + + // Case 2: unref triggered by ThreadLocal instance destruction + auto func2 = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func2, static_cast<void*>(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + // Now destroy one ThreadLocal instance + delete p.tls2; + p.tls2 = nullptr; + // instance destroy for N threads + ASSERT_EQ(unref_count, p.total); + + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + // additional N threads exit unref for the left instance + ASSERT_EQ(unref_count, 2 * p.total); + } +} + +TEST_F(ThreadLocalTest, Swap) { + ThreadLocalPtr tls; + tls.Reset(reinterpret_cast<void*>(1)); 
+ ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(nullptr)), 1); + ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(2)) == nullptr); + ASSERT_EQ(reinterpret_cast<int64_t>(tls.Get()), 2); + ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(reinterpret_cast<void*>(3))), 2); +} + +TEST_F(ThreadLocalTest, Scrape) { + auto unref = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + auto func = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func, static_cast<void*>(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + ASSERT_EQ(unref_count, 0); + + // Scrape all thread local data. No unref at thread + // exit or ThreadLocalPtr destruction + autovector<void*> ptrs; + p.tls1.Scrape(&ptrs, nullptr); + p.tls2->Scrape(&ptrs, nullptr); + delete p.tls2; + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + + ASSERT_EQ(unref_count, 0); + } +} + +TEST_F(ThreadLocalTest, Fold) { + auto unref = [](void* ptr) { + delete static_cast<std::atomic<int64_t>*>(ptr); + }; + static const int kNumThreads = 16; + static const int kItersPerThread = 10; + port::Mutex mu; + port::CondVar cv(&mu); + Params params(&mu, &cv, nullptr, kNumThreads, unref); + auto func = [](void* ptr) { + auto& p = *static_cast<Params*>(ptr); + ASSERT_TRUE(p.tls1.Get() == nullptr); + p.tls1.Reset(new std::atomic<int64_t>(0)); + + for (int i = 0; i < kItersPerThread; ++i) { + static_cast<std::atomic<int64_t>*>(p.tls1.Get())->fetch_add(1); + } + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 0; th < params.total; ++th) { + env_->StartThread(func, static_cast<void*>(¶ms)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (params.completed != params.total) { + cv.Wait(); + } + mu.Unlock(); + + // Verify Fold() behavior + int64_t sum = 0; + params.tls1.Fold( + [](void* ptr, void* res) { + auto sum_ptr = static_cast<int64_t*>(res); + *sum_ptr += static_cast<std::atomic<int64_t>*>(ptr)->load(); + }, + &sum); + ASSERT_EQ(sum, kNumThreads * kItersPerThread); + + // Signal to exit + mu.Lock(); + params.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); +} + +TEST_F(ThreadLocalTest, CompareAndSwap) { + ThreadLocalPtr tls; + ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(1)) == nullptr); + void* expected = reinterpret_cast<void*>(1); + // Swap in 2 + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(2), expected)); + expected = reinterpret_cast<void*>(100); + // Fail Swap, still 2 + ASSERT_TRUE(!tls.CompareAndSwap(reinterpret_cast<void*>(2), expected)); + ASSERT_EQ(expected, reinterpret_cast<void*>(2)); + // Swap in 3 + expected = reinterpret_cast<void*>(2); + 
ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(3), expected));
+  ASSERT_EQ(tls.Get(), reinterpret_cast<void*>(3));
+}
+
+namespace {
+
+void* AccessThreadLocal(void* /*arg*/) {
+  TEST_SYNC_POINT("AccessThreadLocal:Start");
+  ThreadLocalPtr tlp;
+  tlp.Reset(new std::string("hello RocksDB"));
+  TEST_SYNC_POINT("AccessThreadLocal:End");
+  return nullptr;
+}
+
+}  // namespace
+
+// The following test is disabled as it requires manual steps to run it
+// correctly.
+//
+// Currently we have no way to access SyncPoint w/o an ASAN error when the
+// child thread dies after the main thread dies. So if you manually enable
+// this test and only see an ASAN error on SyncPoint, it means you passed
+// the test.
+TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"AccessThreadLocal:Start", "MainThreadDiesFirst:End"},
+       {"PosixEnv::~PosixEnv():End", "AccessThreadLocal:End"}});
+
+  // Triggers the initialization of singletons.
+  Env::Default();
+
+#ifndef ROCKSDB_LITE
+  try {
+#endif  // ROCKSDB_LITE
+    rocksdb::port::Thread th(&AccessThreadLocal, nullptr);
+    th.detach();
+    TEST_SYNC_POINT("MainThreadDiesFirst:End");
+#ifndef ROCKSDB_LITE
+  } catch (const std::system_error& ex) {
+    std::cerr << "Start thread: " << ex.code() << std::endl;
+    FAIL();
+  }
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/thread_operation.h b/src/rocksdb/util/thread_operation.h
new file mode 100644
index 00000000..f1827da0
--- /dev/null
+++ b/src/rocksdb/util/thread_operation.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for thread operation and state.
+// Thread operations are used to describe the high-level action of a
+// thread, such as doing compaction or flush, while thread states
+// are used to describe lower-level actions such as reading /
+// writing a file or waiting for a mutex. Operations and states
+// are designed to be independent. Typically, a thread is involved
+// in one operation and one state at any specific point in time.
+
+#pragma once
+
+#include "rocksdb/thread_status.h"
+
+#include <string>
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+// The structure that describes a major thread operation.
+struct OperationInfo {
+  const ThreadStatus::OperationType type;
+  const std::string name;
+};
+
+// The global operation table.
+//
+// When updating a status of a thread, the pointer of the OperationInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+//
+// Note that it's not designed to be constant as in the future we
+// might consider adding a global count to the OperationInfo.
+static OperationInfo global_operation_table[] = {
+  {ThreadStatus::OP_UNKNOWN, ""},
+  {ThreadStatus::OP_COMPACTION, "Compaction"},
+  {ThreadStatus::OP_FLUSH, "Flush"}
+};
+
+struct OperationStageInfo {
+  const ThreadStatus::OperationStage stage;
+  const std::string name;
+};
+
+// A table that maintains the mapping from stage type to stage string.
+// Note that the string must be changed accordingly when the
+// associated function name changes.
+static OperationStageInfo global_op_stage_table[] = { + {ThreadStatus::STAGE_UNKNOWN, ""}, + {ThreadStatus::STAGE_FLUSH_RUN, + "FlushJob::Run"}, + {ThreadStatus::STAGE_FLUSH_WRITE_L0, + "FlushJob::WriteLevel0Table"}, + {ThreadStatus::STAGE_COMPACTION_PREPARE, + "CompactionJob::Prepare"}, + {ThreadStatus::STAGE_COMPACTION_RUN, + "CompactionJob::Run"}, + {ThreadStatus::STAGE_COMPACTION_PROCESS_KV, + "CompactionJob::ProcessKeyValueCompaction"}, + {ThreadStatus::STAGE_COMPACTION_INSTALL, + "CompactionJob::Install"}, + {ThreadStatus::STAGE_COMPACTION_SYNC_FILE, + "CompactionJob::FinishCompactionOutputFile"}, + {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH, + "MemTableList::PickMemtablesToFlush"}, + {ThreadStatus::STAGE_MEMTABLE_ROLLBACK, + "MemTableList::RollbackMemtableFlush"}, + {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + "MemTableList::TryInstallMemtableFlushResults"}, +}; + +// The structure that describes a state. +struct StateInfo { + const ThreadStatus::StateType type; + const std::string name; +}; + +// The global state table. +// +// When updating a status of a thread, the pointer of the StateInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +static StateInfo global_state_table[] = { + {ThreadStatus::STATE_UNKNOWN, ""}, + {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, +}; + +struct OperationProperty { + int code; + std::string name; +}; + +static OperationProperty compaction_operation_properties[] = { + {ThreadStatus::COMPACTION_JOB_ID, "JobID"}, + {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"}, + {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"}, + {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"}, + {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"}, + {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"}, +}; + +static OperationProperty flush_operation_properties[] = { + {ThreadStatus::FLUSH_JOB_ID, "JobID"}, + {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"}, + {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"} +}; + +#else + +struct OperationInfo { +}; + +struct StateInfo { +}; + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/src/rocksdb/util/threadpool_imp.cc b/src/rocksdb/util/threadpool_imp.cc new file mode 100644 index 00000000..acac0063 --- /dev/null +++ b/src/rocksdb/util/threadpool_imp.cc @@ -0,0 +1,508 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
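Aside on the tables above: the rows of global_operation_table,
global_op_stage_table, and global_state_table appear to be laid out in the
same order as the corresponding enum values, so a consumer can index them
directly. A minimal lookup sketch (illustrative only, not part of
thread_operation.h; it assumes the enum-order layout holds):

  inline const std::string& OperationStageName(
      ThreadStatus::OperationStage stage) {
    // Guard the enum-order assumption before indexing.
    assert(global_op_stage_table[static_cast<size_t>(stage)].stage == stage);
    return global_op_stage_table[static_cast<size_t>(stage)].name;
  }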
+ +#include "util/threadpool_imp.h" + +#include "monitoring/thread_status_util.h" +#include "port/port.h" + +#ifndef OS_WIN +# include <unistd.h> +#endif + +#ifdef OS_LINUX +# include <sys/syscall.h> +# include <sys/resource.h> +#endif + +#include <stdlib.h> +#include <algorithm> +#include <atomic> +#include <condition_variable> +#include <mutex> +#include <sstream> +#include <thread> +#include <vector> + +namespace rocksdb { + +void ThreadPoolImpl::PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +struct ThreadPoolImpl::Impl { + + Impl(); + ~Impl(); + + void JoinThreads(bool wait_for_jobs_to_complete); + + void SetBackgroundThreadsInternal(int num, bool allow_reduce); + int GetBackgroundThreads(); + + unsigned int GetQueueLen() const { + return queue_len_.load(std::memory_order_relaxed); + } + + void LowerIOPriority(); + + void LowerCPUPriority(); + + void WakeUpAllThreads() { + bgsignal_.notify_all(); + } + + void BGThread(size_t thread_id); + + void StartBGThreads(); + + void Submit(std::function<void()>&& schedule, + std::function<void()>&& unschedule, void* tag); + + int UnSchedule(void* arg); + + void SetHostEnv(Env* env) { env_ = env; } + + Env* GetHostEnv() const { return env_; } + + bool HasExcessiveThread() const { + return static_cast<int>(bgthreads_.size()) > total_threads_limit_; + } + + // Return true iff the current thread is the excessive thread to terminate. + // Always terminate the running thread that is added last, even if there are + // more than one thread to terminate. + bool IsLastExcessiveThread(size_t thread_id) const { + return HasExcessiveThread() && thread_id == bgthreads_.size() - 1; + } + + bool IsExcessiveThread(size_t thread_id) const { + return static_cast<int>(thread_id) >= total_threads_limit_; + } + + // Return the thread priority. + // This would allow its member-thread to know its priority. + Env::Priority GetThreadPriority() const { return priority_; } + + // Set the thread priority. + void SetThreadPriority(Env::Priority priority) { priority_ = priority; } + +private: + + static void* BGThreadWrapper(void* arg); + + bool low_io_priority_; + bool low_cpu_priority_; + Env::Priority priority_; + Env* env_; + + int total_threads_limit_; + std::atomic_uint queue_len_; // Queue length. Used for stats reporting + bool exit_all_threads_; + bool wait_for_jobs_to_complete_; + + // Entry per Schedule()/Submit() call + struct BGItem { + void* tag = nullptr; + std::function<void()> function; + std::function<void()> unschedFunction; + }; + + using BGQueue = std::deque<BGItem>; + BGQueue queue_; + + std::mutex mu_; + std::condition_variable bgsignal_; + std::vector<port::Thread> bgthreads_; +}; + + +inline +ThreadPoolImpl::Impl::Impl() + : + low_io_priority_(false), + low_cpu_priority_(false), + priority_(Env::LOW), + env_(nullptr), + total_threads_limit_(0), + queue_len_(), + exit_all_threads_(false), + wait_for_jobs_to_complete_(false), + queue_(), + mu_(), + bgsignal_(), + bgthreads_() { +} + +inline +ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); } + +void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) { + + std::unique_lock<std::mutex> lock(mu_); + assert(!exit_all_threads_); + + wait_for_jobs_to_complete_ = wait_for_jobs_to_complete; + exit_all_threads_ = true; + // prevent threads from being recreated right after they're joined, in case + // the user is concurrently submitting jobs. 
+  total_threads_limit_ = 0;
+
+  lock.unlock();
+
+  bgsignal_.notify_all();
+
+  for (auto& th : bgthreads_) {
+    th.join();
+  }
+
+  bgthreads_.clear();
+
+  exit_all_threads_ = false;
+  wait_for_jobs_to_complete_ = false;
+}
+
+inline
+void ThreadPoolImpl::Impl::LowerIOPriority() {
+  std::lock_guard<std::mutex> lock(mu_);
+  low_io_priority_ = true;
+}
+
+inline
+void ThreadPoolImpl::Impl::LowerCPUPriority() {
+  std::lock_guard<std::mutex> lock(mu_);
+  low_cpu_priority_ = true;
+}
+
+void ThreadPoolImpl::Impl::BGThread(size_t thread_id) {
+  bool low_io_priority = false;
+  bool low_cpu_priority = false;
+
+  while (true) {
+    // Wait until there is an item that is ready to run
+    std::unique_lock<std::mutex> lock(mu_);
+    // Stop waiting if the thread needs to do work or needs to terminate.
+    while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+           (queue_.empty() || IsExcessiveThread(thread_id))) {
+      bgsignal_.wait(lock);
+    }
+
+    if (exit_all_threads_) {  // mechanism to let BG threads exit safely
+
+      if (!wait_for_jobs_to_complete_ ||
+          queue_.empty()) {
+        break;
+      }
+    }
+
+    if (IsLastExcessiveThread(thread_id)) {
+      // Current thread is the last generated one and is excessive.
+      // We always terminate excessive threads in the reverse order of
+      // generation time.
+      auto& terminating_thread = bgthreads_.back();
+      terminating_thread.detach();
+      bgthreads_.pop_back();
+
+      if (HasExcessiveThread()) {
+        // There is still at least one more excessive thread to terminate.
+        WakeUpAllThreads();
+      }
+      break;
+    }
+
+    auto func = std::move(queue_.front().function);
+    queue_.pop_front();
+
+    queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                     std::memory_order_relaxed);
+
+    bool decrease_io_priority = (low_io_priority != low_io_priority_);
+    bool decrease_cpu_priority = (low_cpu_priority != low_cpu_priority_);
+    lock.unlock();
+
+#ifdef OS_LINUX
+    if (decrease_cpu_priority) {
+      setpriority(
+          PRIO_PROCESS,
+          // Current thread.
+          0,
+          // Lowest priority possible.
+          19);
+      low_cpu_priority = true;
+    }
+
+    if (decrease_io_priority) {
+#define IOPRIO_CLASS_SHIFT (13)
+#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data)
+      // Put this thread into the IOPRIO_CLASS_IDLE class (lowest).
+      // These system calls only have an effect when used in conjunction
+      // with an I/O scheduler that supports I/O priorities. As at
+      // kernel 2.6.17 the only such scheduler is the Completely
+      // Fair Queuing (CFQ) I/O scheduler.
+      // To change the scheduler:
+      //   echo cfq > /sys/block/<device_name>/queue/scheduler
+      // Tunables to consider:
+      //   /sys/block/<device_name>/queue/slice_idle
+      //   /sys/block/<device_name>/queue/slice_sync
+      syscall(SYS_ioprio_set, 1,  // IOPRIO_WHO_PROCESS
+              0,                  // current thread
+              IOPRIO_PRIO_VALUE(3, 0));
+      low_io_priority = true;
+    }
+#else
+    (void)decrease_io_priority;  // avoid 'unused variable' error
+    (void)decrease_cpu_priority;
+#endif
+    func();
+  }
+}
+
+// Helper struct for passing arguments when creating threads.
+struct BGThreadMetadata {
+  ThreadPoolImpl::Impl* thread_pool_;
+  size_t thread_id_;  // Ordinal of the thread within the pool.
+ BGThreadMetadata(ThreadPoolImpl::Impl* thread_pool, size_t thread_id) + : thread_pool_(thread_pool), thread_id_(thread_id) {} +}; + +void* ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) { + BGThreadMetadata* meta = reinterpret_cast<BGThreadMetadata*>(arg); + size_t thread_id = meta->thread_id_; + ThreadPoolImpl::Impl* tp = meta->thread_pool_; +#ifdef ROCKSDB_USING_THREAD_STATUS + // initialize it because compiler isn't good enough to see we don't use it + // uninitialized + ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES; + switch (tp->GetThreadPriority()) { + case Env::Priority::HIGH: + thread_type = ThreadStatus::HIGH_PRIORITY; + break; + case Env::Priority::LOW: + thread_type = ThreadStatus::LOW_PRIORITY; + break; + case Env::Priority::BOTTOM: + thread_type = ThreadStatus::BOTTOM_PRIORITY; + break; + case Env::Priority::USER: + thread_type = ThreadStatus::USER; + break; + case Env::Priority::TOTAL: + assert(false); + return nullptr; + } + assert(thread_type != ThreadStatus::NUM_THREAD_TYPES); + ThreadStatusUtil::RegisterThread(tp->GetHostEnv(), thread_type); +#endif + delete meta; + tp->BGThread(thread_id); +#ifdef ROCKSDB_USING_THREAD_STATUS + ThreadStatusUtil::UnregisterThread(); +#endif + return nullptr; +} + +void ThreadPoolImpl::Impl::SetBackgroundThreadsInternal(int num, + bool allow_reduce) { + std::unique_lock<std::mutex> lock(mu_); + if (exit_all_threads_) { + lock.unlock(); + return; + } + if (num > total_threads_limit_ || + (num < total_threads_limit_ && allow_reduce)) { + total_threads_limit_ = std::max(0, num); + WakeUpAllThreads(); + StartBGThreads(); + } +} + +int ThreadPoolImpl::Impl::GetBackgroundThreads() { + std::unique_lock<std::mutex> lock(mu_); + return total_threads_limit_; +} + +void ThreadPoolImpl::Impl::StartBGThreads() { + // Start background thread if necessary + while ((int)bgthreads_.size() < total_threads_limit_) { + + port::Thread p_t(&BGThreadWrapper, + new BGThreadMetadata(this, bgthreads_.size())); + +// Set the thread name to aid debugging +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + auto th_handle = p_t.native_handle(); + std::string thread_priority = Env::PriorityToString(GetThreadPriority()); + std::ostringstream thread_name_stream; + thread_name_stream << "rocksdb:"; + for (char c : thread_priority) { + thread_name_stream << static_cast<char>(tolower(c)); + } + thread_name_stream << bgthreads_.size(); + pthread_setname_np(th_handle, thread_name_stream.str().c_str()); +#endif +#endif + bgthreads_.push_back(std::move(p_t)); + } +} + +void ThreadPoolImpl::Impl::Submit(std::function<void()>&& schedule, + std::function<void()>&& unschedule, void* tag) { + + std::lock_guard<std::mutex> lock(mu_); + + if (exit_all_threads_) { + return; + } + + StartBGThreads(); + + // Add to priority queue + queue_.push_back(BGItem()); + + auto& item = queue_.back(); + item.tag = tag; + item.function = std::move(schedule); + item.unschedFunction = std::move(unschedule); + + queue_len_.store(static_cast<unsigned int>(queue_.size()), + std::memory_order_relaxed); + + if (!HasExcessiveThread()) { + // Wake up at least one waiting thread. + bgsignal_.notify_one(); + } else { + // Need to wake up all threads to make sure the one woken + // up is not the one to terminate. 
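+      // notify_one() might wake the thread that is about to terminate as
+      // excessive; broadcasting guarantees a surviving thread also sees the
+      // new job.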
+ WakeUpAllThreads(); + } +} + +int ThreadPoolImpl::Impl::UnSchedule(void* arg) { + int count = 0; + + std::vector<std::function<void()>> candidates; + { + std::lock_guard<std::mutex> lock(mu_); + + // Remove from priority queue + BGQueue::iterator it = queue_.begin(); + while (it != queue_.end()) { + if (arg == (*it).tag) { + if (it->unschedFunction) { + candidates.push_back(std::move(it->unschedFunction)); + } + it = queue_.erase(it); + count++; + } else { + ++it; + } + } + queue_len_.store(static_cast<unsigned int>(queue_.size()), + std::memory_order_relaxed); + } + + + // Run unschedule functions outside the mutex + for (auto& f : candidates) { + f(); + } + + return count; +} + +ThreadPoolImpl::ThreadPoolImpl() : + impl_(new Impl()) { +} + + +ThreadPoolImpl::~ThreadPoolImpl() { +} + +void ThreadPoolImpl::JoinAllThreads() { + impl_->JoinThreads(false); +} + +void ThreadPoolImpl::SetBackgroundThreads(int num) { + impl_->SetBackgroundThreadsInternal(num, true); +} + +int ThreadPoolImpl::GetBackgroundThreads() { + return impl_->GetBackgroundThreads(); +} + +unsigned int ThreadPoolImpl::GetQueueLen() const { + return impl_->GetQueueLen(); +} + +void ThreadPoolImpl::WaitForJobsAndJoinAllThreads() { + impl_->JoinThreads(true); +} + +void ThreadPoolImpl::LowerIOPriority() { + impl_->LowerIOPriority(); +} + +void ThreadPoolImpl::LowerCPUPriority() { + impl_->LowerCPUPriority(); +} + +void ThreadPoolImpl::IncBackgroundThreadsIfNeeded(int num) { + impl_->SetBackgroundThreadsInternal(num, false); +} + +void ThreadPoolImpl::SubmitJob(const std::function<void()>& job) { + auto copy(job); + impl_->Submit(std::move(copy), std::function<void()>(), nullptr); +} + + +void ThreadPoolImpl::SubmitJob(std::function<void()>&& job) { + impl_->Submit(std::move(job), std::function<void()>(), nullptr); +} + +void ThreadPoolImpl::Schedule(void(*function)(void* arg1), void* arg, + void* tag, void(*unschedFunction)(void* arg)) { + if (unschedFunction == nullptr) { + impl_->Submit(std::bind(function, arg), std::function<void()>(), tag); + } else { + impl_->Submit(std::bind(function, arg), std::bind(unschedFunction, arg), + tag); + } +} + +int ThreadPoolImpl::UnSchedule(void* arg) { + return impl_->UnSchedule(arg); +} + +void ThreadPoolImpl::SetHostEnv(Env* env) { impl_->SetHostEnv(env); } + +Env* ThreadPoolImpl::GetHostEnv() const { return impl_->GetHostEnv(); } + +// Return the thread priority. +// This would allow its member-thread to know its priority. +Env::Priority ThreadPoolImpl::GetThreadPriority() const { + return impl_->GetThreadPriority(); +} + +// Set the thread priority. +void ThreadPoolImpl::SetThreadPriority(Env::Priority priority) { + impl_->SetThreadPriority(priority); +} + +ThreadPool* NewThreadPool(int num_threads) { + ThreadPoolImpl* thread_pool = new ThreadPoolImpl(); + thread_pool->SetBackgroundThreads(num_threads); + return thread_pool; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/threadpool_imp.h b/src/rocksdb/util/threadpool_imp.h new file mode 100644 index 00000000..3cdafb83 --- /dev/null +++ b/src/rocksdb/util/threadpool_imp.h @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "rocksdb/threadpool.h"
+#include "rocksdb/env.h"
+
+#include <memory>
+#include <functional>
+
+namespace rocksdb {
+
+
+class ThreadPoolImpl : public ThreadPool {
+ public:
+  ThreadPoolImpl();
+  ~ThreadPoolImpl();
+
+  ThreadPoolImpl(ThreadPoolImpl&&) = delete;
+  ThreadPoolImpl& operator=(ThreadPoolImpl&&) = delete;
+
+  // Implement ThreadPool interfaces
+
+  // Wait for all threads to finish.
+  // Discards all the jobs that did not
+  // start executing and waits for those running
+  // to complete
+  void JoinAllThreads() override;
+
+  // Set the number of background threads that will be executing the
+  // scheduled jobs.
+  void SetBackgroundThreads(int num) override;
+  int GetBackgroundThreads() override;
+
+  // Get the number of jobs scheduled in the ThreadPool queue.
+  unsigned int GetQueueLen() const override;
+
+  // Waits for all jobs to complete: those
+  // that already started running and those that did not
+  // start yet
+  void WaitForJobsAndJoinAllThreads() override;
+
+  // Make threads run at a lower kernel IO priority
+  // Currently only has effect on Linux
+  void LowerIOPriority();
+
+  // Make threads run at a lower kernel CPU priority
+  // Currently only has effect on Linux
+  void LowerCPUPriority();
+
+  // Ensure there are at least num threads in the pool
+  // but do not kill threads if there are more
+  void IncBackgroundThreadsIfNeeded(int num);
+
+  // Submit a fire and forget job
+  // These jobs can not be unscheduled
+
+  // This allows submitting the same job multiple times
+  void SubmitJob(const std::function<void()>&) override;
+  // This moves the function in for efficiency
+  void SubmitJob(std::function<void()>&&) override;
+
+  // Schedule a job with an unschedule tag and unschedule function
+  // Can be used to filter and unschedule jobs by a tag
+  // that are still in the queue and did not start running
+  void Schedule(void (*function)(void* arg1), void* arg, void* tag,
+                void (*unschedFunction)(void* arg));
+
+  // Filter jobs that are still in the queue and match
+  // the given tag. Remove them from the queue if any,
+  // and for each such job execute the unschedule function
+  // if one was given at scheduling time.
+  int UnSchedule(void* tag);
+
+  void SetHostEnv(Env* env);
+
+  Env* GetHostEnv() const;
+
+  // Return the thread priority.
+  // This would allow its member-thread to know its priority.
+  Env::Priority GetThreadPriority() const;
+
+  // Set the thread priority.
+  void SetThreadPriority(Env::Priority priority);
+
+  static void PthreadCall(const char* label, int result);
+
+  struct Impl;
+
+ private:
+
+  // The current public virtual interface does not provide usable
+  // functionality and thus can not be used internally as a facade
+  // for different implementations.
+  //
+  // We propose a pimpl idiom in order to easily replace the thread pool impl
+  // w/o touching the header file, by providing a different .cc, potentially
+  // driven by a CMake option.
+  //
+  // Another option is to introduce an Env::MakeThreadPool() virtual interface
+  // and override the environment. This would require refactoring ThreadPool
+  // usage.
+  //
+  // We can also combine these two approaches
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/timer_queue.h b/src/rocksdb/util/timer_queue.h
new file mode 100644
index 00000000..bd8a4f85
--- /dev/null
+++ b/src/rocksdb/util/timer_queue.h
@@ -0,0 +1,230 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
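Stepping back to the thread pool interface just shown, a hedged usage sketch
(not code from this tree; it relies only on the public rocksdb/threadpool.h
declarations and the NewThreadPool() factory defined in threadpool_imp.cc):

  #include "rocksdb/threadpool.h"
  #include <memory>

  void Example() {
    std::unique_ptr<rocksdb::ThreadPool> pool(rocksdb::NewThreadPool(4));
    pool->SubmitJob([] { /* background work */ });
    // Drain the queue, then join the worker threads.
    pool->WaitForJobsAndJoinAllThreads();
  }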
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +// Timer Queue +// +// License +// +// The source code in this article is licensed under the CC0 license, so feel +// free to copy, modify, share, do whatever you want with it. +// No attribution is required, but Ill be happy if you do. +// CC0 license + +// The person who associated a work with this deed has dedicated the work to the +// public domain by waiving all of his or her rights to the work worldwide +// under copyright law, including all related and neighboring rights, to the +// extent allowed by law. You can copy, modify, distribute and perform the +// work, even for commercial purposes, all without asking permission. + +#pragma once + +#include <assert.h> +#include <chrono> +#include <condition_variable> +#include <functional> +#include <queue> +#include <thread> +#include <utility> +#include <vector> + +#include "port/port.h" +#include "util/sync_point.h" + +// Allows execution of handlers at a specified time in the future +// Guarantees: +// - All handlers are executed ONCE, even if cancelled (aborted parameter will +// be set to true) +// - If TimerQueue is destroyed, it will cancel all handlers. +// - Handlers are ALWAYS executed in the Timer Queue worker thread. +// - Handlers execution order is NOT guaranteed +// +//////////////////////////////////////////////////////////////////////////////// +// borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +class TimerQueue { + public: + TimerQueue() : m_th(&TimerQueue::run, this) {} + + ~TimerQueue() { shutdown(); } + + // This function is not thread-safe. + void shutdown() { + if (closed_) { + return; + } + cancelAll(); + // Abusing the timer queue to trigger the shutdown. + add(0, [this](bool) { + m_finish = true; + return std::make_pair(false, 0); + }); + m_th.join(); + closed_ = true; + } + + // Adds a new timer + // \return + // Returns the ID of the new timer. You can use this ID to cancel the + // timer + uint64_t add(int64_t milliseconds, + std::function<std::pair<bool, int64_t>(bool)> handler) { + WorkItem item; + Clock::time_point tp = Clock::now(); + item.end = tp + std::chrono::milliseconds(milliseconds); + TEST_SYNC_POINT_CALLBACK("TimeQueue::Add:item.end", &item.end); + item.period = milliseconds; + item.handler = std::move(handler); + + std::unique_lock<std::mutex> lk(m_mtx); + uint64_t id = ++m_idcounter; + item.id = id; + m_items.push(std::move(item)); + + // Something changed, so wake up timer thread + m_checkWork.notify_one(); + return id; + } + + // Cancels the specified timer + // \return + // 1 if the timer was cancelled. + // 0 if you were too late to cancel (or the timer ID was never valid to + // start with) + size_t cancel(uint64_t id) { + // Instead of removing the item from the container (thus breaking the + // heap integrity), we set the item as having no handler, and put + // that handler on a new item at the top for immediate execution + // The timer thread will then ignore the original item, since it has no + // handler. 
+ std::unique_lock<std::mutex> lk(m_mtx); + for (auto&& item : m_items.getContainer()) { + if (item.id == id && item.handler) { + WorkItem newItem; + // Zero time, so it stays at the top for immediate execution + newItem.end = Clock::time_point(); + newItem.id = 0; // Means it is a canceled item + // Move the handler from item to newitem (thus clearing item) + newItem.handler = std::move(item.handler); + m_items.push(std::move(newItem)); + + // Something changed, so wake up timer thread + m_checkWork.notify_one(); + return 1; + } + } + return 0; + } + + // Cancels all timers + // \return + // The number of timers cancelled + size_t cancelAll() { + // Setting all "end" to 0 (for immediate execution) is ok, + // since it maintains the heap integrity + std::unique_lock<std::mutex> lk(m_mtx); + m_cancel = true; + for (auto&& item : m_items.getContainer()) { + if (item.id && item.handler) { + item.end = Clock::time_point(); + item.id = 0; + } + } + auto ret = m_items.size(); + + m_checkWork.notify_one(); + return ret; + } + + private: + using Clock = std::chrono::steady_clock; + TimerQueue(const TimerQueue&) = delete; + TimerQueue& operator=(const TimerQueue&) = delete; + + void run() { + std::unique_lock<std::mutex> lk(m_mtx); + while (!m_finish) { + auto end = calcWaitTime_lock(); + if (end.first) { + // Timers found, so wait until it expires (or something else + // changes) + m_checkWork.wait_until(lk, end.second); + } else { + // No timers exist, so wait forever until something changes + m_checkWork.wait(lk); + } + + // Check and execute as much work as possible, such as, all expired + // timers + checkWork(&lk); + } + + // If we are shutting down, we should not have any items left, + // since the shutdown cancels all items + assert(m_items.size() == 0); + } + + std::pair<bool, Clock::time_point> calcWaitTime_lock() { + while (m_items.size()) { + if (m_items.top().handler) { + // Item present, so return the new wait time + return std::make_pair(true, m_items.top().end); + } else { + // Discard empty handlers (they were cancelled) + m_items.pop(); + } + } + + // No items found, so return no wait time (causes the thread to wait + // indefinitely) + return std::make_pair(false, Clock::time_point()); + } + + void checkWork(std::unique_lock<std::mutex>* lk) { + while (m_items.size() && m_items.top().end <= Clock::now()) { + WorkItem item(m_items.top()); + m_items.pop(); + + if (item.handler) { + (*lk).unlock(); + auto reschedule_pair = item.handler(item.id == 0); + (*lk).lock(); + if (!m_cancel && reschedule_pair.first) { + int64_t new_period = (reschedule_pair.second == -1) + ? 
item.period + : reschedule_pair.second; + + item.period = new_period; + item.end = Clock::now() + std::chrono::milliseconds(new_period); + m_items.push(std::move(item)); + } + } + } + } + + bool m_finish = false; + bool m_cancel = false; + uint64_t m_idcounter = 0; + std::condition_variable m_checkWork; + + struct WorkItem { + Clock::time_point end; + int64_t period; + uint64_t id; // id==0 means it was cancelled + std::function<std::pair<bool, int64_t>(bool)> handler; + bool operator>(const WorkItem& other) const { return end > other.end; } + }; + + std::mutex m_mtx; + // Inheriting from priority_queue, so we can access the internal container + class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>, + std::greater<WorkItem>> { + public: + std::vector<WorkItem>& getContainer() { return this->c; } + } m_items; + rocksdb::port::Thread m_th; + bool closed_ = false; +}; diff --git a/src/rocksdb/util/timer_queue_test.cc b/src/rocksdb/util/timer_queue_test.cc new file mode 100644 index 00000000..5f5f08f2 --- /dev/null +++ b/src/rocksdb/util/timer_queue_test.cc @@ -0,0 +1,72 @@ +// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +// Timer Queue +// +// License +// +// The source code in this article is licensed under the CC0 license, so feel +// free +// to copy, modify, share, do whatever you want with it. +// No attribution is required, but Ill be happy if you do. +// CC0 license + +// The person who associated a work with this deed has dedicated the work to the +// public domain by waiving all of his or her rights to the work worldwide +// under copyright law, including all related and neighboring rights, to the +// extent allowed by law. You can copy, modify, distribute and perform the +// work, even for +// commercial purposes, all without asking permission. See Other Information +// below. 
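One contract worth spelling out before the test program below: the bool
passed to a TimerQueue handler is true when the timer was aborted by
cancel()/cancelAll(), and the returned pair is (reschedule?, new period in
milliseconds), with -1 meaning "keep the current period". A hedged sketch:

  TimerQueue q;
  q.add(500, [](bool aborted) {
    // Fires roughly 500ms from now; 'aborted' is true if cancelled first.
    // {true, -1} reschedules at the same 500ms period; {false, 0} is
    // one-shot.
    return std::make_pair(!aborted, int64_t(-1));
  });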
+// + +#include "util/timer_queue.h" +#include <future> + +namespace Timing { + +using Clock = std::chrono::high_resolution_clock; +double now() { + static auto start = Clock::now(); + return std::chrono::duration<double, std::milli>(Clock::now() - start) + .count(); +} + +} // namespace Timing + +int main() { + TimerQueue q; + + double tnow = Timing::now(); + + q.add(10000, [tnow](bool aborted) mutable { + printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(false, 0); + }); + q.add(10001, [tnow](bool aborted) mutable { + printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(false, 0); + }); + + q.add(1000, [tnow](bool aborted) mutable { + printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(!aborted, 1000); + }); + + auto id = q.add(2000, [tnow](bool aborted) mutable { + printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(!aborted, 2000); + }); + + (void)id; + // auto ret = q.cancel(id); + // assert(ret == 1); + // q.cancelAll(); + + return 0; +} +////////////////////////////////////////// diff --git a/src/rocksdb/util/trace_replay.cc b/src/rocksdb/util/trace_replay.cc new file mode 100644 index 00000000..28160b29 --- /dev/null +++ b/src/rocksdb/util/trace_replay.cc @@ -0,0 +1,300 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/trace_replay.h" + +#include <chrono> +#include <sstream> +#include <thread> +#include "db/db_impl.h" +#include "rocksdb/slice.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace rocksdb { + +const std::string kTraceMagic = "feedcafedeadbeef"; + +namespace { +void EncodeCFAndKey(std::string* dst, uint32_t cf_id, const Slice& key) { + PutFixed32(dst, cf_id); + PutLengthPrefixedSlice(dst, key); +} + +void DecodeCFAndKey(std::string& buffer, uint32_t* cf_id, Slice* key) { + Slice buf(buffer); + GetFixed32(&buf, cf_id); + GetLengthPrefixedSlice(&buf, key); +} +} // namespace + +Tracer::Tracer(Env* env, const TraceOptions& trace_options, + std::unique_ptr<TraceWriter>&& trace_writer) + : env_(env), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)), + trace_request_count_ (0) { + WriteHeader(); +} + +Tracer::~Tracer() { trace_writer_.reset(); } + +Status Tracer::Write(WriteBatch* write_batch) { + TraceType trace_type = kTraceWrite; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + trace.payload = write_batch->Data(); + return WriteTrace(trace); +} + +Status Tracer::Get(ColumnFamilyHandle* column_family, const Slice& key) { + TraceType trace_type = kTraceGet; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + EncodeCFAndKey(&trace.payload, column_family->GetID(), key); + return WriteTrace(trace); +} + +Status Tracer::IteratorSeek(const uint32_t& cf_id, const Slice& key) { + TraceType trace_type = kTraceIteratorSeek; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + EncodeCFAndKey(&trace.payload, cf_id, key); + return WriteTrace(trace); +} + 
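+// For reference, a worked example of the encoding above: for cf_id = 5 and
+// key = "abc", EncodeCFAndKey() emits 05 00 00 00 (PutFixed32,
+// little-endian) followed by 03 61 62 63 (varint32 length, then the key
+// bytes).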
+Status Tracer::IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key) { + TraceType trace_type = kTraceIteratorSeekForPrev; + if (ShouldSkipTrace(trace_type)) { + return Status::OK(); + } + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = trace_type; + EncodeCFAndKey(&trace.payload, cf_id, key); + return WriteTrace(trace); +} + +bool Tracer::ShouldSkipTrace(const TraceType& trace_type) { + if (IsTraceFileOverMax()) { + return true; + } + if ((trace_options_.filter & kTraceFilterGet + && trace_type == kTraceGet) + || (trace_options_.filter & kTraceFilterWrite + && trace_type == kTraceWrite)) { + return true; + } + ++trace_request_count_; + if (trace_request_count_ < trace_options_.sampling_frequency) { + return true; + } + trace_request_count_ = 0; + return false; +} + +bool Tracer::IsTraceFileOverMax() { + uint64_t trace_file_size = trace_writer_->GetFileSize(); + return (trace_file_size > trace_options_.max_trace_file_size); +} + +Status Tracer::WriteHeader() { + std::ostringstream s; + s << kTraceMagic << "\t" + << "Trace Version: 0.1\t" + << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t" + << "Format: Timestamp OpType Payload\n"; + std::string header(s.str()); + + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = kTraceBegin; + trace.payload = header; + return WriteTrace(trace); +} + +Status Tracer::WriteFooter() { + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = kTraceEnd; + trace.payload = ""; + return WriteTrace(trace); +} + +Status Tracer::WriteTrace(const Trace& trace) { + std::string encoded_trace; + PutFixed64(&encoded_trace, trace.ts); + encoded_trace.push_back(trace.type); + PutFixed32(&encoded_trace, static_cast<uint32_t>(trace.payload.size())); + encoded_trace.append(trace.payload); + return trace_writer_->Write(Slice(encoded_trace)); +} + +Status Tracer::Close() { return WriteFooter(); } + +Replayer::Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles, + std::unique_ptr<TraceReader>&& reader) + : trace_reader_(std::move(reader)) { + assert(db != nullptr); + db_ = static_cast<DBImpl*>(db->GetRootDB()); + for (ColumnFamilyHandle* cfh : handles) { + cf_map_[cfh->GetID()] = cfh; + } +} + +Replayer::~Replayer() { trace_reader_.reset(); } + +Status Replayer::Replay() { + Status s; + Trace header; + s = ReadHeader(&header); + if (!s.ok()) { + return s; + } + + std::chrono::system_clock::time_point replay_epoch = + std::chrono::system_clock::now(); + WriteOptions woptions; + ReadOptions roptions; + Trace trace; + uint64_t ops = 0; + Iterator* single_iter = nullptr; + while (s.ok()) { + trace.reset(); + s = ReadTrace(&trace); + if (!s.ok()) { + break; + } + + std::this_thread::sleep_until( + replay_epoch + std::chrono::microseconds(trace.ts - header.ts)); + if (trace.type == kTraceWrite) { + WriteBatch batch(trace.payload); + db_->Write(woptions, &batch); + ops++; + } else if (trace.type == kTraceGet) { + uint32_t cf_id = 0; + Slice key; + DecodeCFAndKey(trace.payload, &cf_id, &key); + if (cf_id > 0 && cf_map_.find(cf_id) == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + std::string value; + if (cf_id == 0) { + db_->Get(roptions, key, &value); + } else { + db_->Get(roptions, cf_map_[cf_id], key, &value); + } + ops++; + } else if (trace.type == kTraceIteratorSeek) { + uint32_t cf_id = 0; + Slice key; + DecodeCFAndKey(trace.payload, &cf_id, &key); + if (cf_id > 0 && cf_map_.find(cf_id) == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + if 
(cf_id == 0) { + single_iter = db_->NewIterator(roptions); + } else { + single_iter = db_->NewIterator(roptions, cf_map_[cf_id]); + } + single_iter->Seek(key); + ops++; + delete single_iter; + } else if (trace.type == kTraceIteratorSeekForPrev) { + // Currently, only support to call the Seek() + uint32_t cf_id = 0; + Slice key; + DecodeCFAndKey(trace.payload, &cf_id, &key); + if (cf_id > 0 && cf_map_.find(cf_id) == cf_map_.end()) { + return Status::Corruption("Invalid Column Family ID."); + } + + if (cf_id == 0) { + single_iter = db_->NewIterator(roptions); + } else { + single_iter = db_->NewIterator(roptions, cf_map_[cf_id]); + } + single_iter->SeekForPrev(key); + ops++; + delete single_iter; + } else if (trace.type == kTraceEnd) { + // Do nothing for now. + // TODO: Add some validations later. + break; + } + } + + if (s.IsIncomplete()) { + // Reaching eof returns Incomplete status at the moment. + // Could happen when killing a process without calling EndTrace() API. + // TODO: Add better error handling. + return Status::OK(); + } + return s; +} + +Status Replayer::ReadHeader(Trace* header) { + assert(header != nullptr); + Status s = ReadTrace(header); + if (!s.ok()) { + return s; + } + if (header->type != kTraceBegin) { + return Status::Corruption("Corrupted trace file. Incorrect header."); + } + if (header->payload.substr(0, kTraceMagic.length()) != kTraceMagic) { + return Status::Corruption("Corrupted trace file. Incorrect magic."); + } + + return s; +} + +Status Replayer::ReadFooter(Trace* footer) { + assert(footer != nullptr); + Status s = ReadTrace(footer); + if (!s.ok()) { + return s; + } + if (footer->type != kTraceEnd) { + return Status::Corruption("Corrupted trace file. Incorrect footer."); + } + + // TODO: Add more validations later + return s; +} + +Status Replayer::ReadTrace(Trace* trace) { + assert(trace != nullptr); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + + Slice enc_slice = Slice(encoded_trace); + GetFixed64(&enc_slice, &trace->ts); + trace->type = static_cast<TraceType>(enc_slice[0]); + enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize); + trace->payload = enc_slice.ToString(); + return s; +} + +} // namespace rocksdb diff --git a/src/rocksdb/util/trace_replay.h b/src/rocksdb/util/trace_replay.h new file mode 100644 index 00000000..749ea2f6 --- /dev/null +++ b/src/rocksdb/util/trace_replay.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include <memory> +#include <unordered_map> +#include <utility> + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/trace_reader_writer.h" + +namespace rocksdb { + +class ColumnFamilyHandle; +class ColumnFamilyData; +class DB; +class DBImpl; +class Slice; +class WriteBatch; + +extern const std::string kTraceMagic; +const unsigned int kTraceTimestampSize = 8; +const unsigned int kTraceTypeSize = 1; +const unsigned int kTracePayloadLengthSize = 4; +const unsigned int kTraceMetadataSize = + kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize; + +enum TraceType : char { + kTraceBegin = 1, + kTraceEnd = 2, + kTraceWrite = 3, + kTraceGet = 4, + kTraceIteratorSeek = 5, + kTraceIteratorSeekForPrev = 6, + kTraceMax, +}; + +// TODO: This should also be made part of public interface to help users build +// custom TracerReaders and TraceWriters. +struct Trace { + uint64_t ts; + TraceType type; + std::string payload; + + void reset() { + ts = 0; + type = kTraceMax; + payload.clear(); + } +}; + +// Trace RocksDB operations using a TraceWriter. +class Tracer { + public: + Tracer(Env* env, const TraceOptions& trace_options, + std::unique_ptr<TraceWriter>&& trace_writer); + ~Tracer(); + + Status Write(WriteBatch* write_batch); + Status Get(ColumnFamilyHandle* cfname, const Slice& key); + Status IteratorSeek(const uint32_t& cf_id, const Slice& key); + Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + bool IsTraceFileOverMax(); + + Status Close(); + + private: + Status WriteHeader(); + Status WriteFooter(); + Status WriteTrace(const Trace& trace); + bool ShouldSkipTrace(const TraceType& type); + + Env* env_; + TraceOptions trace_options_; + std::unique_ptr<TraceWriter> trace_writer_; + uint64_t trace_request_count_; +}; + +// Replay RocksDB operations from a trace. +class Replayer { + public: + Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles, + std::unique_ptr<TraceReader>&& reader); + ~Replayer(); + + Status Replay(); + + private: + Status ReadHeader(Trace* header); + Status ReadFooter(Trace* footer); + Status ReadTrace(Trace* trace); + + DBImpl* db_; + std::unique_ptr<TraceReader> trace_reader_; + std::unordered_map<uint32_t, ColumnFamilyHandle*> cf_map_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/transaction_test_util.cc b/src/rocksdb/util/transaction_test_util.cc new file mode 100644 index 00000000..30cff11e --- /dev/null +++ b/src/rocksdb/util/transaction_test_util.cc @@ -0,0 +1,385 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
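Before the transaction test utility, a hedged end-to-end sketch of driving
the tracing classes above (DB::StartTrace()/EndTrace() and
NewFileTraceWriter()/NewFileTraceReader() are assumed from the public
RocksDB API of this version; error handling omitted):

  // Record a workload.
  std::unique_ptr<rocksdb::TraceWriter> trace_writer;
  rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(), "/tmp/trace",
                              &trace_writer);
  db->StartTrace(rocksdb::TraceOptions(), std::move(trace_writer));
  // ... issue Get()/Write() traffic ...
  db->EndTrace();

  // Later, replay it against another DB instance.
  std::unique_ptr<rocksdb::TraceReader> trace_reader;
  rocksdb::NewFileTraceReader(env, rocksdb::EnvOptions(), "/tmp/trace",
                              &trace_reader);
  rocksdb::Replayer replayer(db2, column_families, std::move(trace_reader));
  rocksdb::Status s = replayer.Replay();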
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "util/transaction_test_util.h"
+
+#include <inttypes.h>
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <string>
+#include <thread>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+#include "db/dbformat.h"
+#include "db/snapshot_impl.h"
+#include "util/logging.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+RandomTransactionInserter::RandomTransactionInserter(
+    Random64* rand, const WriteOptions& write_options,
+    const ReadOptions& read_options, uint64_t num_keys, uint16_t num_sets,
+    const uint64_t cmt_delay_ms, const uint64_t first_id)
+    : rand_(rand),
+      write_options_(write_options),
+      read_options_(read_options),
+      num_keys_(num_keys),
+      num_sets_(num_sets),
+      txn_id_(first_id),
+      cmt_delay_ms_(cmt_delay_ms) {}
+
+RandomTransactionInserter::~RandomTransactionInserter() {
+  if (txn_ != nullptr) {
+    delete txn_;
+  }
+  if (optimistic_txn_ != nullptr) {
+    delete optimistic_txn_;
+  }
+}
+
+bool RandomTransactionInserter::TransactionDBInsert(
+    TransactionDB* db, const TransactionOptions& txn_options) {
+  txn_ = db->BeginTransaction(write_options_, txn_options, txn_);
+
+  std::hash<std::thread::id> hasher;
+  char name[64];
+  snprintf(name, 64, "txn%" ROCKSDB_PRIszt "-%" PRIu64,
+           hasher(std::this_thread::get_id()), txn_id_++);
+  assert(strlen(name) < 64 - 1);
+  assert(txn_->SetName(name).ok());
+
+  // Take a snapshot if set_snapshot was not set, or with 50% chance otherwise
+  bool take_snapshot = txn_->GetSnapshot() == nullptr || rand_->OneIn(2);
+  if (take_snapshot) {
+    txn_->SetSnapshot();
+    read_options_.snapshot = txn_->GetSnapshot();
+  }
+  auto res = DoInsert(db, txn_, false);
+  if (take_snapshot) {
+    read_options_.snapshot = nullptr;
+  }
+  return res;
+}
+
+bool RandomTransactionInserter::OptimisticTransactionDBInsert(
+    OptimisticTransactionDB* db,
+    const OptimisticTransactionOptions& txn_options) {
+  optimistic_txn_ =
+      db->BeginTransaction(write_options_, txn_options, optimistic_txn_);
+
+  return DoInsert(db, optimistic_txn_, true);
+}
+
+bool RandomTransactionInserter::DBInsert(DB* db) {
+  return DoInsert(db, nullptr, false);
+}
+
+Status RandomTransactionInserter::DBGet(
+    DB* db, Transaction* txn, ReadOptions& read_options, uint16_t set_i,
+    uint64_t ikey, bool get_for_update, uint64_t* int_value,
+    std::string* full_key, bool* unexpected_error) {
+  Status s;
+  // Five digits (since the largest uint16_t is 65535) plus the NUL
+  // end char.
+  char prefix_buf[6];
+  // Pad the prefix appropriately so we can iterate over each set
+  assert(set_i + 1 <= 9999);
+  snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1);
+  // key format: [SET#][random#]
+  std::string skey = ToString(ikey);
+  Slice base_key(skey);
+  *full_key = std::string(prefix_buf) + base_key.ToString();
+  Slice key(*full_key);
+
+  std::string value;
+  if (txn != nullptr) {
+    if (get_for_update) {
+      s = txn->GetForUpdate(read_options, key, &value);
+    } else {
+      s = txn->Get(read_options, key, &value);
+    }
+  } else {
+    s = db->Get(read_options, key, &value);
+  }
+
+  if (s.ok()) {
+    // Found the key; parse its value
+    *int_value = std::stoull(value);
+    if (*int_value == 0 || *int_value == ULONG_MAX) {
+      *unexpected_error = true;
+      fprintf(stderr, "Get returned unexpected value: %s\n", value.c_str());
+      s = Status::Corruption();
+    }
+  } else if (s.IsNotFound()) {
+    // Have not yet written to this key, so assume its value is 0
+    *int_value = 0;
+    s = Status::OK();
+  }
+  return s;
+}
+
+bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn,
+                                         bool is_optimistic) {
+  Status s;
+  WriteBatch batch;
+
+  // pick a random number to use to increment a key in each set
+  uint64_t incr = (rand_->Next() % 100) + 1;
+  bool unexpected_error = false;
+
+  std::vector<uint16_t> set_vec(num_sets_);
+  std::iota(set_vec.begin(), set_vec.end(), static_cast<uint16_t>(0));
+  std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{});
+
+  // For each set, pick a key at random and increment it
+  for (uint16_t set_i : set_vec) {
+    uint64_t int_value = 0;
+    std::string full_key;
+    uint64_t rand_key = rand_->Next() % num_keys_;
+    const bool get_for_update = txn ? rand_->OneIn(2) : false;
+    s = DBGet(db, txn, read_options_, set_i, rand_key, get_for_update,
+              &int_value, &full_key, &unexpected_error);
+    Slice key(full_key);
+    if (!s.ok()) {
+      // Optimistic transactions should never return a non-ok status here.
+      // Non-optimistic transactions may return write-conflict/timeout errors.
+      if (is_optimistic || !(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) {
+        fprintf(stderr, "Get returned an unexpected error: %s\n",
+                s.ToString().c_str());
+        unexpected_error = true;
+      }
+      break;
+    }
+
+    if (s.ok()) {
+      // Increment the key
+      std::string sum = ToString(int_value + incr);
+      if (txn != nullptr) {
+        s = txn->Put(key, sum);
+        if (!get_for_update && (s.IsBusy() || s.IsTimedOut())) {
+          // If the initial get was not for update, then the key is not locked
+          // before the put, and the put could fail due to concurrent writes.
+          break;
+        } else if (!s.ok()) {
+          // Since we did a GetForUpdate, Put should not fail.
+ fprintf(stderr, "Put returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + } else { + batch.Put(key, sum); + } + bytes_inserted_ += key.size() + sum.size(); + } + if (txn != nullptr) { + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Insert (%s) %s snap: %" PRIu64 " key:%s value: %" PRIu64 + "+%" PRIu64 "=%" PRIu64, + txn->GetName().c_str(), s.ToString().c_str(), + txn->GetSnapshot()->GetSequenceNumber(), full_key.c_str(), + int_value, incr, int_value + incr); + } + } + + if (s.ok()) { + if (txn != nullptr) { + bool with_prepare = !is_optimistic && !rand_->OneIn(10); + if (with_prepare) { + // Also try commit without prepare + s = txn->Prepare(); + assert(s.ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), + s.ToString().c_str(), txn->GetName().c_str()); + db->GetDBOptions().env->SleepForMicroseconds( + static_cast<int>(cmt_delay_ms_ * 1000)); + } + if (!rand_->OneIn(20)) { + s = txn->Commit(); + assert(!with_prepare || s.ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Commit of %" PRIu64 " %s (%s)", txn->GetId(), + s.ToString().c_str(), txn->GetName().c_str()); + } else { + // Also try 5% rollback + s = txn->Rollback(); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, + "Rollback %" PRIu64 " %s %s", txn->GetId(), + txn->GetName().c_str(), s.ToString().c_str()); + assert(s.ok()); + } + assert(is_optimistic || s.ok()); + + if (!s.ok()) { + if (is_optimistic) { + // Optimistic transactions can have write-conflict errors on commit. + // Any other error is unexpected. + if (!(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { + unexpected_error = true; + } + } else { + // Non-optimistic transactions should only fail due to expiration + // or write failures. For testing purproses, we do not expect any + // write failures. + if (!s.IsExpired()) { + unexpected_error = true; + } + } + + if (unexpected_error) { + fprintf(stderr, "Commit returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + } else { + s = db->Write(write_options_, &batch); + if (!s.ok()) { + unexpected_error = true; + fprintf(stderr, "Write returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + } else { + if (txn != nullptr) { + assert(txn->Rollback().ok()); + ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Error %s for txn %s", + s.ToString().c_str(), txn->GetName().c_str()); + } + } + + if (s.ok()) { + success_count_++; + } else { + failure_count_++; + } + + last_status_ = s; + + // return success if we didn't get any unexpected errors + return !unexpected_error; +} + +// Verify that the sum of the keys in each set are equal +Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets, + uint64_t num_keys_per_set, + bool take_snapshot, Random64* rand, + uint64_t delay_ms) { + // delay_ms is the delay between taking a snapshot and doing the reads. It + // emulates reads from a long-running backup job. 
+ assert(delay_ms == 0 || take_snapshot); + uint64_t prev_total = 0; + uint32_t prev_i = 0; + bool prev_assigned = false; + + ReadOptions roptions; + if (take_snapshot) { + roptions.snapshot = db->GetSnapshot(); + db->GetDBOptions().env->SleepForMicroseconds( + static_cast<int>(delay_ms * 1000)); + } + + std::vector<uint16_t> set_vec(num_sets); + std::iota(set_vec.begin(), set_vec.end(), static_cast<uint16_t>(0)); + std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); + + // For each set of keys with the same prefix, sum all the values + for (uint16_t set_i : set_vec) { + // Five digits (since the largest uint16_t is 65535) plus the NUL + // end char. + char prefix_buf[6]; + assert(set_i + 1 <= 9999); + snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1); + uint64_t total = 0; + + // Use either point lookup or iterator. Point lookups are slower so we use + // it less often. + const bool use_point_lookup = + num_keys_per_set != 0 && rand && rand->OneIn(10); + if (use_point_lookup) { + ReadOptions read_options; + for (uint64_t k = 0; k < num_keys_per_set; k++) { + std::string dont_care; + uint64_t int_value = 0; + bool unexpected_error = false; + const bool FOR_UPDATE = false; + Status s = DBGet(db, nullptr, roptions, set_i, k, FOR_UPDATE, + &int_value, &dont_care, &unexpected_error); + assert(s.ok()); + assert(!unexpected_error); + total += int_value; + } + } else { // user iterators + Iterator* iter = db->NewIterator(roptions); + for (iter->Seek(Slice(prefix_buf, 4)); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + // stop when we reach a different prefix + if (key.ToString().compare(0, 4, prefix_buf) != 0) { + break; + } + Slice value = iter->value(); + uint64_t int_value = std::stoull(value.ToString()); + if (int_value == 0 || int_value == ULONG_MAX) { + fprintf(stderr, "Iter returned unexpected value: %s\n", + value.ToString().c_str()); + return Status::Corruption(); + } + ROCKS_LOG_DEBUG( + db->GetDBOptions().info_log, + "VerifyRead at %" PRIu64 " (%" PRIu64 "): %.*s value: %" PRIu64, + roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul, + roptions.snapshot + ? ((SnapshotImpl*)roptions.snapshot)->min_uncommitted_ + : 0ul, + static_cast<int>(key.size()), key.data(), int_value); + total += int_value; + } + delete iter; + } + + if (prev_assigned && total != prev_total) { + db->GetDBOptions().info_log->Flush(); + fprintf(stdout, + "RandomTransactionVerify found inconsistent totals using " + "pointlookup? %d " + "Set[%" PRIu32 "]: %" PRIu64 ", Set[%" PRIu32 "]: %" PRIu64 + " at snapshot %" PRIu64 "\n", + use_point_lookup, prev_i, prev_total, set_i, total, + roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul); + fflush(stdout); + return Status::Corruption(); + } else { + ROCKS_LOG_DEBUG( + db->GetDBOptions().info_log, + "RandomTransactionVerify pass pointlookup? %d total: %" PRIu64 + " snap: %" PRIu64, + use_point_lookup, total, + roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul); + } + prev_total = total; + prev_i = set_i; + prev_assigned = true; + } + if (take_snapshot) { + db->ReleaseSnapshot(roptions.snapshot); + } + + return Status::OK(); +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/util/transaction_test_util.h b/src/rocksdb/util/transaction_test_util.h new file mode 100644 index 00000000..1aa4196a --- /dev/null +++ b/src/rocksdb/util/transaction_test_util.h @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
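Stepping back to the stress utility implemented above, a hedged driver
sketch (txn_db is assumed to be an open TransactionDB*; single-threaded for
brevity):

  rocksdb::Random64 rand(301);
  rocksdb::RandomTransactionInserter inserter(&rand);
  for (int i = 0; i < 1000; i++) {
    // Each call increments one random key per set inside a transaction.
    inserter.TransactionDBInsert(txn_db);
  }
  // Whether or not individual transactions failed, the per-set totals
  // must still agree.
  rocksdb::Status s =
      rocksdb::RandomTransactionInserter::Verify(txn_db, /*num_sets=*/3);
  assert(s.ok());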
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/options.h" +#include "port/port.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction_db.h" + +namespace rocksdb { + +class DB; +class Random64; + +// Utility class for stress testing transactions. Can be used to write many +// transactions in parallel and then validate that the data written is logically +// consistent. This class assumes the input DB is initially empty. +// +// Each call to TransactionDBInsert()/OptimisticTransactionDBInsert() will +// increment the value of a key in #num_sets sets of keys. Regardless of +// whether the transaction succeeds, the total sum of values of keys in each +// set is an invariant that should remain equal. +// +// After calling TransactionDBInsert()/OptimisticTransactionDBInsert() many +// times, Verify() can be called to validate that the invariant holds. +// +// To test writing Transaction in parallel, multiple threads can create a +// RandomTransactionInserter with similar arguments using the same DB. +class RandomTransactionInserter { + public: + // num_keys is the number of keys in each set. + // num_sets is the number of sets of keys. + // cmt_delay_ms is the delay between prepare (if there is any) and commit + // first_id is the id of the first transaction + explicit RandomTransactionInserter( + Random64* rand, const WriteOptions& write_options = WriteOptions(), + const ReadOptions& read_options = ReadOptions(), uint64_t num_keys = 1000, + uint16_t num_sets = 3, const uint64_t cmt_delay_ms = 0, + const uint64_t first_id = 0); + + ~RandomTransactionInserter(); + + // Increment a key in each set using a Transaction on a TransactionDB. + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). Error status may be obtained by calling + // GetLastStatus(); + bool TransactionDBInsert( + TransactionDB* db, + const TransactionOptions& txn_options = TransactionOptions()); + + // Increment a key in each set using a Transaction on an + // OptimisticTransactionDB + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). Error status may be obtained by calling + // GetLastStatus(); + bool OptimisticTransactionDBInsert( + OptimisticTransactionDB* db, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions()); + // Increment a key in each set without using a transaction. If this function + // is called in parallel, then Verify() may fail. + // + // Returns true if the write succeeds. + // Error status may be obtained by calling GetLastStatus(). + bool DBInsert(DB* db); + + // Get the ikey'th key from set set_i + static Status DBGet(DB* db, Transaction* txn, ReadOptions& read_options, + uint16_t set_i, uint64_t ikey, bool get_for_update, + uint64_t* int_value, std::string* full_key, + bool* unexpected_error); + + // Returns OK if Invariant is true. 
+  static Status Verify(DB* db, uint16_t num_sets, uint64_t num_keys_per_set = 0,
+                       bool take_snapshot = false, Random64* rand = nullptr,
+                       uint64_t delay_ms = 0);
+
+  // Returns the status of the previous Insert operation
+  Status GetLastStatus() { return last_status_; }
+
+  // Returns the number of successful calls to
+  // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert
+  uint64_t GetSuccessCount() { return success_count_; }
+
+  // Returns the number of calls to
+  // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert that did not
+  // write any data.
+  uint64_t GetFailureCount() { return failure_count_; }
+
+  // Returns the total size, in bytes, of the user keys/values Put() to the DB.
+  size_t GetBytesInserted() { return bytes_inserted_; }
+
+ private:
+  // Input options
+  Random64* rand_;
+  const WriteOptions write_options_;
+  ReadOptions read_options_;
+  const uint64_t num_keys_;
+  const uint16_t num_sets_;
+
+  // Number of successful insert batches performed
+  uint64_t success_count_ = 0;
+
+  // Number of failed insert batches attempted
+  uint64_t failure_count_ = 0;
+
+  size_t bytes_inserted_ = 0;
+
+  // Status returned by most recent insert operation
+  Status last_status_;
+
+  // Optimization: re-use allocated transaction objects.
+  Transaction* txn_ = nullptr;
+  Transaction* optimistic_txn_ = nullptr;
+
+  uint64_t txn_id_;
+
+  // The delay between ::Prepare and ::Commit
+  const uint64_t cmt_delay_ms_;
+
+  bool DoInsert(DB* db, Transaction* txn, bool is_optimistic);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/util/user_comparator_wrapper.h b/src/rocksdb/util/user_comparator_wrapper.h
new file mode 100644
index 00000000..43797709
--- /dev/null
+++ b/src/rocksdb/util/user_comparator_wrapper.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+
+namespace rocksdb {
+
+// Wrapper around a user comparator that automatically increments
+// perf_context.user_key_comparison_count on every comparison.
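+//
+// A minimal usage sketch (illustrative only; BytewiseComparator() is the
+// stock RocksDB comparator, and perf_context must be enabled for the counter
+// to be observable):
+//
+//   UserComparatorWrapper wrapped(BytewiseComparator());
+//   int r = wrapped.Compare(Slice("a"), Slice("b"));  // r < 0, and
+//   // perf_context.user_key_comparison_count has been incremented.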
+class UserComparatorWrapper final : public Comparator { + public: + explicit UserComparatorWrapper(const Comparator* const user_cmp) + : user_comparator_(user_cmp) {} + + ~UserComparatorWrapper() = default; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const Slice& a, const Slice& b) const override { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->Compare(a, b); + } + + bool Equal(const Slice& a, const Slice& b) const override { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->Equal(a, b); + } + + const char* Name() const override { return user_comparator_->Name(); } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + return user_comparator_->FindShortestSeparator(start, limit); + } + + void FindShortSuccessor(std::string* key) const override { + return user_comparator_->FindShortSuccessor(key); + } + + const Comparator* GetRootComparator() const override { + return user_comparator_->GetRootComparator(); + } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + return user_comparator_->IsSameLengthImmediateSuccessor(s, t); + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return user_comparator_->CanKeysWithDifferentByteContentsBeEqual(); + } + + private: + const Comparator* user_comparator_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/util/util.h b/src/rocksdb/util/util.h new file mode 100644 index 00000000..a5fd3649 --- /dev/null +++ b/src/rocksdb/util/util.h @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+
+#ifndef FALLTHROUGH_INTENDED
+#if defined(__clang__)
+#define FALLTHROUGH_INTENDED [[clang::fallthrough]]
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define FALLTHROUGH_INTENDED [[gnu::fallthrough]]
+#else
+#define FALLTHROUGH_INTENDED do {} while (0)
+#endif
+#endif
diff --git a/src/rocksdb/util/vector_iterator.h b/src/rocksdb/util/vector_iterator.h
new file mode 100644
index 00000000..da60eb22
--- /dev/null
+++ b/src/rocksdb/util/vector_iterator.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace rocksdb {
+
+// Iterator over a vector of keys/values
+class VectorIterator : public InternalIterator {
+ public:
+  VectorIterator(std::vector<std::string> keys, std::vector<std::string> values,
+                 const InternalKeyComparator* icmp)
+      : keys_(std::move(keys)),
+        values_(std::move(values)),
+        indexed_cmp_(icmp, &keys_),
+        // Use the member keys_ here: the parameter `keys` has just been moved
+        // from, so its size is unspecified. Starting at keys_.size() leaves
+        // the iterator positioned past the end, i.e. !Valid().
+        current_(keys_.size()) {
+    assert(keys_.size() == values_.size());
+
+    indices_.reserve(keys_.size());
+    for (size_t i = 0; i < keys_.size(); i++) {
+      indices_.push_back(i);
+    }
+    std::sort(indices_.begin(), indices_.end(), indexed_cmp_);
+  }
+
+  virtual bool Valid() const override {
+    return !indices_.empty() && current_ < indices_.size();
+  }
+
+  virtual void SeekToFirst() override { current_ = 0; }
+  virtual void SeekToLast() override { current_ = indices_.size() - 1; }
+
+  virtual void Seek(const Slice& target) override {
+    current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
+                                indexed_cmp_) -
+               indices_.begin();
+  }
+
+  virtual void SeekForPrev(const Slice& target) override {
+    current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
+                                indexed_cmp_) -
+               indices_.begin();
+    if (!Valid()) {
+      SeekToLast();
+    } else {
+      Prev();
+    }
+  }
+
+  virtual void Next() override { current_++; }
+  virtual void Prev() override { current_--; }
+
+  virtual Slice key() const override {
+    return Slice(keys_[indices_[current_]]);
+  }
+  virtual Slice value() const override {
+    return Slice(values_[indices_[current_]]);
+  }
+
+  virtual Status status() const override { return Status::OK(); }
+
+  virtual bool IsKeyPinned() const override { return true; }
+  virtual bool IsValuePinned() const override { return true; }
+
+ private:
+  struct IndexedKeyComparator {
+    IndexedKeyComparator(const InternalKeyComparator* c,
+                         const std::vector<std::string>* ks)
+        : cmp(c), keys(ks) {}
+
+    bool operator()(size_t a, size_t b) const {
+      return cmp->Compare((*keys)[a], (*keys)[b]) < 0;
+    }
+
+    bool operator()(size_t a, const Slice& b) const {
+      return cmp->Compare((*keys)[a], b) < 0;
+    }
+
+    bool operator()(const Slice& a, size_t b) const {
+      return cmp->Compare(a, (*keys)[b]) < 0;
+    }
+
+    const InternalKeyComparator* cmp;
+    const std::vector<std::string>* keys;
+  };
+
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+  IndexedKeyComparator indexed_cmp_;
+  std::vector<size_t> indices_;
+  size_t current_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/xxhash.cc b/src/rocksdb/util/xxhash.cc
new file mode 100644
index 00000000..2ec95a63
--- /dev/null
+++ b/src/rocksdb/util/xxhash.cc
@@ -0,0 +1,1074 @@
+/*
+xxHash - Fast Hash algorithm
+Copyright (C) 2012-2014, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+
+//**************************************
+// Tuning parameters
+//**************************************
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable. Unfortunately, on some target/compiler combinations, the
+ * generated assembly is sub-optimal. The switch below allows selecting a
+ * different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on a compiler extension (i.e.,
+ * not portable). This method is safe if your compiler supports it, and
+ * *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on the compiler, but it
+ * violates the C standard. It can generate buggy code on targets which do not
+ * support unaligned memory accesses. But in some circumstances, it's the only
+ * known way to get the best performance (e.g., GCC + ARMv6).
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2).
+ */
+
+#include "util/util.h"
+
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line \
+                                   for example */
+#if defined(__GNUC__) && \
+    (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+     defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
+     defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__))
+#define XXH_FORCE_MEMORY_ACCESS 2
+#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+    (defined(__GNUC__) && \
+     (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+      defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \
+      defined(__ARM_ARCH_7S__)))
+#define XXH_FORCE_MEMORY_ACCESS 1
+#endif
+#endif
+
+// Unaligned memory access is automatically enabled for "common" CPUs, such as
+// x86. For other CPUs, the compiler will be more cautious and insert extra
+// code to ensure aligned access is respected. If you know your target CPU
+// supports unaligned memory access, you may want to force this option manually
+// to improve performance.
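+// (Illustration only: rather than editing this file, such a build could
+// define the macro on the compiler command line, e.g.
+//     g++ -DXXH_USE_UNALIGNED_ACCESS=1 -c xxhash.cc
+// or pass the equivalent flag through its build system.)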
+// You can also enable this parameter if you know your input data will always
+// be aligned (boundaries of 4, for U32).
+#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_USE_UNALIGNED_ACCESS 1
+#endif
+
+// XXH_ACCEPT_NULL_INPUT_POINTER :
+// If the input pointer is a null pointer, xxHash's default behavior is to
+// trigger a memory access error, since it is a bad pointer.
+// When this option is enabled, xxHash's output for a null input pointer is the
+// same as for a zero-length input.
+// This option has a very small performance cost (only measurable on small
+// inputs). By default, this option is disabled. To enable it, uncomment the
+// define below:
+//#define XXH_ACCEPT_NULL_INPUT_POINTER 1
+
+// XXH_FORCE_NATIVE_FORMAT :
+// By default, the xxHash library provides endian-independent hash values,
+// based on the little-endian convention.
+// Results are therefore identical for little-endian and big-endian CPUs.
+// This comes at a performance cost for big-endian CPUs, since some swapping is
+// required to emulate the little-endian format.
+// Should endian-independence be of no importance for your application, you may
+// set the #define below to 1. It will improve speed for big-endian CPUs.
+// This option has no impact on little-endian CPUs.
+#define XXH_FORCE_NATIVE_FORMAT 0
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || \
+    defined(_M_X64)
+#define XXH_FORCE_ALIGN_CHECK 0
+#else
+#define XXH_FORCE_ALIGN_CHECK 1
+#endif
+#endif
+
+//**************************************
+// Compiler Specific Options
+//**************************************
+// Disable some Visual Studio warning messages
+#ifdef _MSC_VER  // Visual Studio
+# pragma warning(disable : 4127)  // disable: C4127: conditional expression is constant
+# pragma warning(disable : 4804)  // disable: C4804: 'operation' : unsafe use of type 'bool' in operation (static assert line 313)
+#endif
+
+#ifdef _MSC_VER  // Visual Studio
+# define FORCE_INLINE static __forceinline
+#else
+# ifdef __GNUC__
+#  define FORCE_INLINE static inline __attribute__((always_inline))
+# else
+#  define FORCE_INLINE static inline
+# endif
+#endif
+
+
+//**************************************
+// Includes & Memory related functions
+//**************************************
+#include "xxhash.h"
+// Modify the local functions below should you wish to use some other memory
+// related routines for malloc(), free()
+#include <stdlib.h>
+FORCE_INLINE void* XXH_malloc(size_t s) { return malloc(s); }
+FORCE_INLINE void  XXH_free (void* p)  { free(p); }
+// for memcpy()
+#include <string.h>
+FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+#include <assert.h> /* assert */
+
+namespace rocksdb {
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L  // C99
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef int32_t  S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+ typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# ifdef __IBMC__ +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct _U32_S { U32 v; } _PACKED U32_S; + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# pragma pack(pop) +#endif + +#define A32(x) (((U32_S *)(x))->v) + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1)) + +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { + U32 u32; +} __attribute__((packed)) unalign; +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ +static U32 XXH_read32(const void* memPtr) { + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +//*************************************** +// Compiler-specific Functions and Macros +//*************************************** +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// Note : although _rotl exists for minGW (GCC under windows), performance seems poor +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +#define XXH_rotl64(x, r) _rotl64(x, r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#define XXH_rotl64(x, r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) // Visual Studio +# define XXH_swap32 _byteswap_ulong +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static inline U32 XXH_swap32 (U32 x) { + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff );} +#endif + + +//************************************** +// Constants +//************************************** +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + + +//************************************** +// Architecture Macros +//************************************** +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; +#ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch + static const int one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) +#endif + + +//************************************** +// Macros +//************************************** +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations + + +//**************************** +// Memory reads +//**************************** +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? 
A32(ptr) : XXH_swap32(A32(ptr));
+    else
+        return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian,
+                                    XXH_alignment align) {
+  if (align == XXH_unaligned)
+    return endian == XXH_littleEndian ? XXH_read32(ptr)
+                                      : XXH_swap32(XXH_read32(ptr));
+  else
+    return endian == XXH_littleEndian ? *(const U32*)ptr
+                                      : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) {
+  return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+//****************************
+// Simple Hash Functions
+//****************************
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+FORCE_INLINE U32 XXH32_endian_align(const void* input, int len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U32 h32;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; }
+#endif
+
+    if (len>=16)
+    {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do
+        {
+            v1 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4;
+            v2 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4;
+            v3 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4;
+            v4 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    }
+    else
+    {
+        h32 = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p<=bEnd-4)
+    {
+        h32 += XXH_readLE32_align((const U32*)p, endian, align) * PRIME32_3;
+        h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd)
+    {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+U32 XXH32(const void* input, int len, U32 seed)
+{
+#if 0
+    // Simple version, good for code maintenance, but unfortunately slow for small inputs
+    void* state = XXH32_init(seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+# if !defined(XXH_USE_UNALIGNED_ACCESS)
+    // Note: the alignment test must be "== 0" to match the comment below;
+    // a non-zero low-bit pattern means the input is *not* 4-byte aligned.
+    if ((((size_t)input) & 3) == 0)   // Input is aligned, let's leverage the speed advantage
+    {
+        if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+            return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+        else
+            return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }
+# endif
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+struct XXH_state32_t
+{
+    U64 total_len;
+    U32 seed;
+    U32 v1;
+    U32 v2;
+    U32 v3;
+    U32 v4;
+    int memsize;
+    char memory[16];
+};
+
+
+int XXH32_sizeofState()
+{
+    XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t));   // A compilation error here
means XXH32_SIZEOFSTATE is not large enough + return sizeof(struct XXH_state32_t); +} + + +XXH_errorcode XXH32_resetState(void* state_in, U32 seed) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + state->seed = seed; + state->v1 = seed + PRIME32_1 + PRIME32_2; + state->v2 = seed + PRIME32_2; + state->v3 = seed + 0; + state->v4 = seed - PRIME32_1; + state->total_len = 0; + state->memsize = 0; + return XXH_OK; +} + + +void* XXH32_init (U32 seed) +{ + void* state = XXH_malloc (sizeof(struct XXH_state32_t)); + XXH32_resetState(state, seed); + return state; +} + + +FORCE_INLINE XXH_errorcode XXH32_update_endian (void* state_in, const void* input, int len, XXH_endianess endian) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 16) // fill in tmp buffer + { + XXH_memcpy(state->memory + state->memsize, input, len); + state->memsize += len; + return XXH_OK; + } + + if (state->memsize) // some data left from previous update + { + XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize); + { + const U32* p32 = (const U32*)state->memory; + state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++; + state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; + state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++; + state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) + { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do + { + v1 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; + v2 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; + v3 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; + v4 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + XXH_memcpy(state->memory, p, bEnd-p); + state->memsize = (int)(bEnd-p); + } + + return XXH_OK; +} + +XXH_errorcode XXH32_update (void* state_in, const void* input, int len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE U32 XXH32_intermediateDigest_endian (void* state_in, XXH_endianess endian) +{ + struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; + const BYTE * p = (const BYTE*)state->memory; + BYTE* bEnd = (BYTE*)state->memory + state->memsize; + U32 h32; + + if (state->total_len >= 16) + { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } + else + { + h32 = state->seed + PRIME32_5; + } + + h32 += (U32) 
state->total_len; + + while (p<=bEnd-4) + { + h32 += XXH_readLE32((const U32*)p, endian) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4; + p+=4; + } + + while (p<bEnd) + { + h32 += (*p) * PRIME32_5; + h32 = XXH_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +U32 XXH32_intermediateDigest (void* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_intermediateDigest_endian(state_in, XXH_littleEndian); + else + return XXH32_intermediateDigest_endian(state_in, XXH_bigEndian); +} + + +U32 XXH32_digest (void* state_in) +{ + U32 h32 = XXH32_intermediateDigest(state_in); + + XXH_free(state_in); + + return h32; +} + +/* ******************************************************************* + * 64-bit hash functions + *********************************************************************/ + + #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + + /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ + static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + + #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + + /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ + /* currently only defined for gcc and icc */ + typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64; + static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + + #else + + /* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ + + static U64 XXH_read64(const void* memPtr) + { + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; + } +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +#define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +#define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64(U64 x) { + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, + XXH_alignment align) { + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read64(ptr) + : XXH_swap64(XXH_read64(ptr)); + else + return endian == XXH_littleEndian ? *(const U64*)ptr + : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) { + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) { + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = + 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 + */ +static const U64 PRIME64_2 = + 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 + */ +static const U64 PRIME64_3 = + 1609587929392839161ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 + */ +static const U64 PRIME64_4 = + 9650029242287828579ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 + */ +static const U64 PRIME64_5 = + 2870177450012600261ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 + */ + +static U64 XXH64_round(U64 acc, U64 input) { + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) { + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) { + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) { + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 \ + { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; \ + } + + switch (len & 31) { + case 24: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 16: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 8: + PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 20: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 12: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 4: + PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 17: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 9: + PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 21: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 13: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 5: + PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 18: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 10: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 22: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 14: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 6: + PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 19: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 11: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return 
XXH64_avalanche(h64); + + case 31: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 23: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 15: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 7: + PROCESS4_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 3: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 2: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 1: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 0: + return XXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) { + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE*)(size_t)32; + } +#endif + + if (len >= 32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); + p += 8; + v2 = XXH64_round(v2, XXH_get64bits(p)); + p += 8; + v3 = XXH64_round(v3, XXH_get64bits(p)); + p += 8; + v4 = XXH64_round(v4, XXH_get64bits(p)); + p += 8; + } while (p <= limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64)len; + + return XXH64_finalize(h64, p, len, endian, align); +} + +unsigned long long XXH64(const void* input, size_t len, + unsigned long long seed) { +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7) == + 0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH64_state_t* XXH64_createState(void) { + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) { + XXH_free(statePtr); + return XXH_OK; +} + +void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) { + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) { + XXH64_state_t state; /* using a local state to memcpy() in order to avoid + strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - 
PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE XXH_errorcode XXH64_update_endian(XXH64_state_t* state, + const void* input, size_t len, + XXH_endianess endian) { + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, + 32 - state->memsize); + state->v1 = + XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0, endian)); + state->v2 = + XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1, endian)); + state->v3 = + XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2, endian)); + state->v4 = + XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3, endian)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); + p += 8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); + p += 8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); + p += 8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); + p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + } + + return XXH_OK; +} + +XXH_errorcode XXH64_update(XXH64_state_t* state_in, const void* input, + size_t len) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian(const XXH64_state_t* state, + XXH_endianess endian) { + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64)state->total_len; + + return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, + XXH_aligned); +} + +unsigned long long XXH64_digest(const XXH64_state_t* state_in) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + +/*====== Canonical representation ======*/ + +void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) { + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + 
memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) {
+  return XXH_readBE64(src);
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/xxhash.h b/src/rocksdb/util/xxhash.h
new file mode 100644
index 00000000..88352ac7
--- /dev/null
+++ b/src/rocksdb/util/xxhash.h
@@ -0,0 +1,240 @@
+/*
+   xxHash - Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2014, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : http://code.google.com/p/xxhash/
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, 32-bit Windows 7, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing the SMHasher test set.
+10 is a perfect score.
+*/
+
+#pragma once
+
+#include <stdlib.h>
+
+#if !defined(__VMS) && \
+    (defined(__cplusplus) || \
+     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
+#include <stdint.h>
+#endif
+
+#if defined (__cplusplus)
+namespace rocksdb {
+#endif
+
+
+//****************************
+// Type
+//****************************
+/* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+
+//****************************
+// Simple Hash Functions
+//****************************
+
+unsigned int XXH32 (const void* input, int len, unsigned int seed);
+
+/*
+XXH32() :
+    Calculate the 32-bit hash of the sequence of "len" bytes stored at memory
+    address "input".
+    The memory between input & input+len must be valid (allocated and
+    read-accessible).
+    "seed" can be used to alter the result predictably.
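+    (A minimal illustrative call, with buf/buf_len standing in for any valid
+    byte range and 0 as an arbitrary seed:
+        unsigned int h = XXH32(buf, buf_len, 0);
+    )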
+    This function successfully passes all SMHasher tests.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+    Note that "len" is type "int", which means it is limited to 2^31-1.
+    If your data is larger, use the advanced functions below.
+*/
+
+
+
+//****************************
+// Advanced Hash Functions
+//****************************
+
+void*         XXH32_init   (unsigned int seed);
+XXH_errorcode XXH32_update (void* state, const void* input, int len);
+unsigned int  XXH32_digest (void* state);
+
+/*
+These functions calculate the xxHash of an input provided in several small
+packets, as opposed to an input provided as a single block.
+
+The calculation must be started with :
+void* XXH32_init()
+The function returns a pointer which holds the state of the calculation.
+
+This pointer must be provided as the "void* state" parameter for
+XXH32_update(). XXH32_update() can be called as many times as necessary.
+The user must provide a valid (allocated) input.
+The function returns an error code, with 0 meaning OK, and any other value
+meaning there is an error.
+Note that "len" is type "int", which means it is limited to 2^31-1.
+If your data is larger, it is recommended to chunk your data into blocks of,
+for example, 2^30 bytes (1 GB) to avoid any "int" overflow issue.
+
+Finally, you can end the calculation at any time by using XXH32_digest().
+This function returns the final 32-bit hash.
+You must provide the same "void* state" parameter that was created by
+XXH32_init(). Memory will be freed by XXH32_digest().
+*/
+
+
+int           XXH32_sizeofState();
+XXH_errorcode XXH32_resetState(void* state, unsigned int seed);
+
+#define       XXH32_SIZEOFSTATE 48
+typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t;
+/*
+These functions allow a user application to make its own allocation for the
+state.
+
+XXH32_sizeofState() is used to know how much space must be allocated for the
+xxHash 32-bit state. Note that the state must be aligned to access 'long long'
+fields. Memory must be allocated and referenced by a pointer.
+This pointer must then be provided as 'state' into XXH32_resetState(), which
+initializes the state.
+
+For static allocation purposes (such as allocation on stack, or freestanding
+systems without malloc()), use the structure XXH32_stateSpace_t, which will
+ensure that the memory space is large enough and correctly aligned to access
+'long long' fields.
+*/
+
+
+unsigned int XXH32_intermediateDigest (void* state);
+/*
+This function does the same as XXH32_digest(), generating a 32-bit hash,
+but preserves the memory context.
+This way, it becomes possible to generate intermediate hashes, and then
+continue feeding data with XXH32_update().
+To free the memory context, use XXH32_digest(), or free().
+*/
+
+
+
+//****************************
+// Deprecated function names
+//****************************
+// The following translations are provided to ease code transition.
+// You are encouraged to no longer use these function names.
+#define XXH32_feed   XXH32_update
+#define XXH32_result XXH32_digest
+#define XXH32_getIntermediateResult XXH32_intermediateDigest
+
+/*-**********************************************************************
+ * 64-bit hash
+ ************************************************************************/
+typedef unsigned long long XXH64_hash_t;
+
+/*! XXH64() :
+    Calculate the 64-bit hash of the sequence of "len" bytes stored at memory
+    address "input". "seed" can be used to alter the result predictably.
This + function runs faster on 64-bit systems, but slower on 32-bit systems (see + benchmark). +*/ +XXH64_hash_t XXH64(const void* input, size_t length, unsigned long long seed); + +/*====== Streaming ======*/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH64_state_t* XXH64_createState(void); +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed); +XXH_errorcode XXH64_update(XXH64_state_t* statePtr, const void* input, + size_t length); +XXH64_hash_t XXH64_digest(const XXH64_state_t* statePtr); + +/*====== Canonical representation ======*/ +typedef struct { + unsigned char digest[8]; +} XXH64_canonical_t; +void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +/* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. */ + +#if !defined(__VMS) && \ + (defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) + +struct XXH64_state_s { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; + uint32_t reserved[2]; /* never read nor write, might be removed in a future + version */ +}; /* typedef'd to XXH64_state_t */ + +#else + +#ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ +struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; + unsigned memsize; + unsigned reserved[2]; /* never read nor write, might be removed in a future + version */ +}; /* typedef'd to XXH64_state_t */ +#endif + +#endif + +#if defined (__cplusplus) +} // namespace rocksdb +#endif |
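The hashing entry points declared in this header compose in two ways: one-shot hashing via XXH32()/XXH64(), and streaming hashing via the create/reset/update/digest sequence. Below is a minimal sketch of both paths as this header declares them, assuming it is compiled as C++ against this tree; the helper name, buffers, and seed are illustrative, not part of the API. Note that, unlike XXH32_digest(), XXH64_digest() does not free the state.

    #include "util/xxhash.h"
    #include <cassert>
    #include <string>

    unsigned long long HashWholeAndStreamed(const std::string& part1,
                                            const std::string& part2) {
      // One-shot: hash a single contiguous buffer with seed 0.
      const std::string whole = part1 + part2;
      unsigned long long one_shot =
          rocksdb::XXH64(whole.data(), whole.size(), 0);

      // Streaming: feed the same bytes in two packets.
      rocksdb::XXH64_state_t* state = rocksdb::XXH64_createState();
      rocksdb::XXH64_reset(state, 0);
      rocksdb::XXH64_update(state, part1.data(), part1.size());
      rocksdb::XXH64_update(state, part2.data(), part2.size());
      unsigned long long streamed = rocksdb::XXH64_digest(state);
      rocksdb::XXH64_freeState(state);  // digest did not free the state

      // Both paths hashed the same byte sequence, so the results agree.
      assert(one_shot == streamed);
      return streamed;
    }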